From f67ad5be678278c67e8714bd08dacb38bdb0ebb2 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Wed, 30 Dec 2020 23:57:48 +0800 Subject: [PATCH] Update transformer modules (#1147) * Add token-cls task for transformer modules * Fix numpy dtype mismatch in windows * Update README.md * Update token-cls task for ernie_tiny * Update token-cls task for ernie_tiny * Update token-cls task for other Transformer modules * Update README.md of modules and token-cls demo * Add chunk_scheme arg check in SeqLabelingDataset * Update ChunkEvaluator and paddlenlp requirement * Update README.md * Update token-cls demo --- demo/sequence_labeling/README.md | 41 ++- demo/sequence_labeling/train.py | 52 ++- .../language_model/bert-base-cased/README.md | 52 +-- .../language_model/bert-base-cased/module.py | 215 ++++--------- .../bert-base-chinese/README.md | 55 ++-- .../bert-base-chinese/module.py | 221 ++++--------- .../bert-base-multilingual-cased/README.md | 55 ++-- .../bert-base-multilingual-cased/module.py | 221 ++++--------- .../bert-base-multilingual-uncased/README.md | 55 ++-- .../bert-base-multilingual-uncased/module.py | 222 ++++--------- .../bert-base-uncased/README.md | 55 ++-- .../bert-base-uncased/module.py | 215 ++++--------- .../language_model/bert-large-cased/README.md | 55 ++-- .../language_model/bert-large-cased/module.py | 214 ++++--------- .../bert-large-uncased/README.md | 57 ++-- .../bert-large-uncased/module.py | 215 ++++--------- .../language_model/chinese_bert_wwm/README.md | 250 +++++---------- .../chinese_bert_wwm/model/__init__.py | 0 .../chinese_bert_wwm/model/bert.py | 197 ------------ .../model/transformer_encoder.py | 295 ------------------ .../language_model/chinese_bert_wwm/module.py | 148 ++++++--- .../chinese_bert_wwm_ext/README.md | 152 +++++---- .../chinese_bert_wwm_ext/model/__init__.py | 0 .../chinese_bert_wwm_ext/model/bert.py | 197 ------------ .../model/transformer_encoder.py | 295 ------------------ .../chinese_bert_wwm_ext/module.py | 148 ++++++--- modules/text/language_model/ernie/README.md | 53 ++-- modules/text/language_model/ernie/module.py | 214 ++++--------- .../text/language_model/ernie_tiny/README.md | 34 +- .../text/language_model/ernie_tiny/module.py | 73 ++--- .../ernie_v2_eng_base/README.md | 52 +-- .../ernie_v2_eng_base/module.py | 221 ++++--------- .../ernie_v2_eng_large/README.md | 54 ++-- .../ernie_v2_eng_large/module.py | 221 ++++--------- modules/text/language_model/rbt3/README.md | 152 +++++---- .../language_model/rbt3/model/__init__.py | 0 .../text/language_model/rbt3/model/bert.py | 197 ------------ .../rbt3/model/transformer_encoder.py | 295 ------------------ modules/text/language_model/rbt3/module.py | 145 ++++++--- modules/text/language_model/rbtl3/README.md | 152 +++++---- .../language_model/rbtl3/model/__init__.py | 0 .../text/language_model/rbtl3/model/bert.py | 197 ------------ .../rbtl3/model/transformer_encoder.py | 295 ------------------ modules/text/language_model/rbtl3/module.py | 145 ++++++--- .../roberta-wwm-ext-large/README.md | 56 ++-- .../roberta-wwm-ext-large/module.py | 219 ++++--------- .../language_model/roberta-wwm-ext/README.md | 56 ++-- .../language_model/roberta-wwm-ext/module.py | 219 ++++--------- paddlehub/datasets/base_nlp_dataset.py | 35 +-- paddlehub/datasets/msra_ner.py | 12 +- paddlehub/module/modeling_bert.py | 289 ----------------- paddlehub/module/modeling_ernie.py | 243 --------------- paddlehub/module/modeling_roberta.py | 215 ------------- paddlehub/module/nlp_module.py | 14 +- requirements.txt | 2 
+- 55 files changed, 2111 insertions(+), 5431 deletions(-) delete mode 100644 modules/text/language_model/chinese_bert_wwm/model/__init__.py delete mode 100644 modules/text/language_model/chinese_bert_wwm/model/bert.py delete mode 100644 modules/text/language_model/chinese_bert_wwm/model/transformer_encoder.py delete mode 100644 modules/text/language_model/chinese_bert_wwm_ext/model/__init__.py delete mode 100644 modules/text/language_model/chinese_bert_wwm_ext/model/bert.py delete mode 100644 modules/text/language_model/chinese_bert_wwm_ext/model/transformer_encoder.py delete mode 100644 modules/text/language_model/rbt3/model/__init__.py delete mode 100644 modules/text/language_model/rbt3/model/bert.py delete mode 100644 modules/text/language_model/rbt3/model/transformer_encoder.py delete mode 100644 modules/text/language_model/rbtl3/model/__init__.py delete mode 100644 modules/text/language_model/rbtl3/model/bert.py delete mode 100644 modules/text/language_model/rbtl3/model/transformer_encoder.py delete mode 100644 paddlehub/module/modeling_bert.py delete mode 100644 paddlehub/module/modeling_ernie.py delete mode 100644 paddlehub/module/modeling_roberta.py diff --git a/demo/sequence_labeling/README.md b/demo/sequence_labeling/README.md index fda17c32..04c3450a 100644 --- a/demo/sequence_labeling/README.md +++ b/demo/sequence_labeling/README.md @@ -28,10 +28,21 @@ python train.py 使用PaddleHub Fine-tune API进行Fine-tune可以分为4个步骤。 ### Step1: 选择模型 + +在命名实体识别的任务中,因不同的数据集标识实体的标签不同,评测的方式也有所差异。因此,在初始化模型之前,需要先确定实际标签的形式,下方的`label_list`则是MSRA-NER数据集中使用的标签类别。 +如果用户使用的实体识别数据集的标签方式与MSRA-NER不同,则需要根据实际数据集自行确定。 +```python +label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"] +label_map = { + idx: label for idx, label in enumerate(label_list) +} +``` + +接下来创建任务所使用的`model`: ```python import paddlehub as hub -model = hub.Module(name='ernie_tiny', version='2.0.1', task='token-cls') +model = hub.Module(name='ernie_tiny', version='2.0.1', task='token-cls', label_map=label_map) ``` 其中,参数: @@ -40,7 +51,29 @@ model = hub.Module(name='ernie_tiny', version='2.0.1', task='token-cls') * `version`:module版本号 * `task`:fine-tune任务。此处为`token-cls`,表示序列标注任务。 -通过以上的一行代码,`model`初始化为一个适用于序列标注任务的模型,为ERNIE Tiny的预训练模型后拼接上一个输出token共享的全连接网络(Full Connected)。 +PaddleHub还提供BERT等模型可供选择,当前支持序列标注任务的模型对应的加载示例如下: + +模型名 | PaddleHub Module ---------------------------------- | :------: +ERNIE, Chinese | `hub.Module(name='ernie')` +ERNIE tiny, Chinese | `hub.Module(name='ernie_tiny')` +ERNIE 2.0 Base, English | `hub.Module(name='ernie_v2_eng_base')` +ERNIE 2.0 Large, English | `hub.Module(name='ernie_v2_eng_large')` +BERT-Base, Cased | `hub.Module(name='bert-base-cased')` +BERT-Base, Uncased | `hub.Module(name='bert-base-uncased')` +BERT-Large, Cased | `hub.Module(name='bert-large-cased')` +BERT-Large, Uncased | `hub.Module(name='bert-large-uncased')` +BERT-Base, Multilingual Cased | `hub.Module(name='bert-base-multilingual-cased')` +BERT-Base, Multilingual Uncased | `hub.Module(name='bert-base-multilingual-uncased')` +BERT-Base, Chinese | `hub.Module(name='bert-base-chinese')` +BERT-wwm, Chinese | `hub.Module(name='chinese-bert-wwm')` +BERT-wwm-ext, Chinese | `hub.Module(name='chinese-bert-wwm-ext')` +RoBERTa-wwm-ext, Chinese | `hub.Module(name='roberta-wwm-ext')` +RoBERTa-wwm-ext-large, Chinese | `hub.Module(name='roberta-wwm-ext-large')` +RBT3, Chinese | `hub.Module(name='rbt3')` +RBTL3, Chinese | `hub.Module(name='rbtl3')` + +通过以上的一行代码,`model`初始化为一个适用于序列标注任务的模型,为ERNIE Tiny的预训练模型后拼接上一个输出token共享的全连接网络(Fully Connected)。
![](https://ss1.bdstatic.com/70cFuXSh_Q1YnxGkpoWK1HF6hhy/it/u=224484727,3049769188&fm=15&gp=0.jpg) 以上图片来自于:https://arxiv.org/pdf/1810.04805.pdf @@ -49,9 +82,9 @@ model = hub.Module(name='ernie_tiny', version='2.0.1', task='token-cls') ```python train_dataset = hub.datasets.MSRA_NER( - tokenizer=model.get_tokenizer(tokenize_chinese_chars=True), max_seq_len=50, mode='train') + tokenizer=model.get_tokenizer(), max_seq_len=128, mode='train') dev_dataset = hub.datasets.MSRA_NER( - tokenizer=model.get_tokenizer(tokenize_chinese_chars=True), max_seq_len=50, mode='dev') + tokenizer=model.get_tokenizer(), max_seq_len=128, mode='dev') ``` * `tokenizer`:表示该module所需用到的tokenizer,其将对输入文本完成切词,并转化成module运行所需模型输入格式。 diff --git a/demo/sequence_labeling/train.py b/demo/sequence_labeling/train.py index 43a81fb4..3e26d20b 100644 --- a/demo/sequence_labeling/train.py +++ b/demo/sequence_labeling/train.py @@ -14,32 +14,60 @@ import paddle import paddlehub as hub +from paddlehub.datasets import MSRA_NER + +import ast +import argparse + +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--num_epoch", type=int, default=3, help="Number of epochs for fine-tuning.") +parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether to use GPU for fine-tuning; input should be True or False.") +parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") +parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest sequence.") +parser.add_argument("--batch_size", type=int, default=32, help="Total number of examples in one training batch.") +parser.add_argument("--checkpoint_dir", type=str, default='./checkpoint', help="Directory to save model checkpoints.") +parser.add_argument("--save_interval", type=int, default=1, help="Save a checkpoint every n epochs.") + +args = parser.parse_args() + if __name__ == '__main__': label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"] label_map = { idx: label for idx, label in enumerate(label_list) } + model = hub.Module( name='ernie_tiny', version='2.0.1', task='token-cls', - label_map=label_map, + label_map=label_map, # Required for token classification task ) - train_dataset = hub.datasets.MSRA_NER( - tokenizer=model.get_tokenizer(), - max_seq_len=128, + tokenizer = model.get_tokenizer() + train_dataset = MSRA_NER( + tokenizer=tokenizer, + max_seq_len=args.max_seq_len, mode='train' ) - - dev_dataset = hub.datasets.MSRA_NER( - tokenizer=model.get_tokenizer(), - max_seq_len=50, + dev_dataset = MSRA_NER( + tokenizer=tokenizer, + max_seq_len=args.max_seq_len, mode='dev' ) + test_dataset = MSRA_NER( + tokenizer=tokenizer, + max_seq_len=args.max_seq_len, + mode='test' + ) - optimizer = paddle.optimizer.AdamW(learning_rate=5e-5, parameters=model.parameters()) - trainer = hub.Trainer(model, optimizer, checkpoint_dir='token_cls_save_dir', use_gpu=True) - - trainer.train(train_dataset, epochs=3, batch_size=32, eval_dataset=dev_dataset, save_interval=1) + optimizer = paddle.optimizer.AdamW(learning_rate=args.learning_rate, parameters=model.parameters()) + trainer = hub.Trainer(model, optimizer, checkpoint_dir=args.checkpoint_dir, use_gpu=args.use_gpu) + trainer.train( + train_dataset, + epochs=args.num_epoch, + batch_size=args.batch_size, + eval_dataset=dev_dataset, + save_interval=args.save_interval, + ) + trainer.evaluate(test_dataset, batch_size=args.batch_size) diff --git a/modules/text/language_model/bert-base-cased/README.md
b/modules/text/language_model/bert-base-cased/README.md index 2d6aac86..f75cfd72 100644 --- a/modules/text/language_model/bert-base-cased/README.md +++ b/modules/text/language_model/bert-base-cased/README.md @@ -1,5 +1,5 @@ ```shell -$ hub install bert-base-cased==2.0.0 +$ hub install bert-base-cased==2.0.1 ```

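For orientation before the API reference below, here is a minimal fine-tuning sketch exercising the new `token-cls` task on this module. It simply transplants the demo `train.py` flow above onto `bert-base-cased`; the MSRA-NER labels and all hyperparameters are illustrative rather than values prescribed by this patch (MSRA-NER is a Chinese dataset, so for this cased English model you would normally substitute an English token-level dataset).

```python
import paddle
import paddlehub as hub

# BIO label scheme of the MSRA-NER dataset used in the demo above.
label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
label_map = {idx: label for idx, label in enumerate(label_list)}

# task='token-cls' attaches a token-level classification head sized by label_map.
model = hub.Module(name='bert-base-cased', version='2.0.1', task='token-cls', label_map=label_map)

tokenizer = model.get_tokenizer()
train_dataset = hub.datasets.MSRA_NER(tokenizer=tokenizer, max_seq_len=128, mode='train')
dev_dataset = hub.datasets.MSRA_NER(tokenizer=tokenizer, max_seq_len=128, mode='dev')

optimizer = paddle.optimizer.AdamW(learning_rate=5e-5, parameters=model.parameters())
trainer = hub.Trainer(model, optimizer, checkpoint_dir='./checkpoint', use_gpu=False)
trainer.train(train_dataset, epochs=3, batch_size=32, eval_dataset=dev_dataset, save_interval=1)
```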
@@ -14,23 +14,29 @@ $ hub install bert-base-cased==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** @@ -45,7 +51,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -54,7 +62,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -68,16 +78,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='bert-base-cased', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -85,7 +95,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -111,12 +123,12 @@ $ hub serving start -m bert-base-cased import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ... ] +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/bert-base-cased" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -149,3 +161,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/bert-base-cased/module.py b/modules/text/language_model/bert-base-cased/module.py index 92a8b7d2..8b7b75d5 100644 --- a/modules/text/language_model/bert-base-cased/module.py +++ b/modules/text/language_model/bert-base-cased/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_bert import BertForSequenceClassification, BertModel -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="bert-base-cased", - version="2.0.0", + version="2.0.1", summary= "bert_cased_L-12_H-768_A-12, 12-layer, 768-hidden, 12-heads, 110M parameters. The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Bert(nn.Layer): """ BERT model @@ -41,181 +43,80 @@ class Bert(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Bert, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': - self.model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path='bert-base-cased') + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': + self.model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path='bert-base-cased', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained(pretrained_model_name_or_path='bert-base-cased', num_classes=self.num_classes, **kwargs) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-cased') + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-cased', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, 
token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'bert-base-cased', 'bert-base-cased-vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-cased-vocab.txt" - download(url, os.path.join(DATA_HOME, 'bert-base-cased')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. 
- max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - - encoded_inputs = tokenizer.encode(text, pad_to_max_seq_len=False) - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-base-cased', *args, **kwargs) diff --git a/modules/text/language_model/bert-base-chinese/README.md b/modules/text/language_model/bert-base-chinese/README.md index d13c35db..3d9d31de 100644 --- a/modules/text/language_model/bert-base-chinese/README.md +++ b/modules/text/language_model/bert-base-chinese/README.md @@ -1,6 +1,7 @@ ```shell -$ hub install bert-base-chinese==2.0.0 +$ hub install bert-base-chinese==2.0.1 ``` +


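The `token-cls` modules predict one BIO label per token. As a purely illustrative post-processing step (this `decode_bio` helper is hypothetical and not part of any module's API), the predicted labels can be folded back into entity spans:

```python
from typing import List, Tuple

def decode_bio(tokens: List[str], labels: List[str]) -> List[Tuple[str, str]]:
    """Fold per-token BIO labels into (entity_text, entity_type) spans."""
    spans, current, current_type = [], [], None
    for token, label in zip(tokens, labels):
        if label.startswith('B-'):  # a new entity opens
            if current:
                spans.append((''.join(current), current_type))
            current, current_type = [token], label[2:]
        elif label.startswith('I-') and current_type == label[2:]:  # entity continues
            current.append(token)
        else:  # 'O', or an I- tag that does not continue the open entity
            if current:
                spans.append((''.join(current), current_type))
            current, current_type = [], None
    if current:
        spans.append((''.join(current), current_type))
    return spans

# [('琦玉', 'PER'), ('埼玉', 'LOC')]
print(decode_bio(list("琦玉住在埼玉"), ["B-PER", "I-PER", "O", "O", "B-LOC", "I-LOC"]))
```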
@@ -13,29 +14,35 @@ $ hub install bert-base-chinese==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -44,7 +51,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -53,7 +62,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -67,16 +78,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='bert-base-chinese', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -84,7 +95,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -110,12 +123,12 @@ $ hub serving start -m bert-base-chinese import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ...
] +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/bert-base-chinese" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -148,3 +161,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/bert-base-chinese/module.py b/modules/text/language_model/bert-base-chinese/module.py index f6c63c35..bb8cca19 100644 --- a/modules/text/language_model/bert-base-chinese/module.py +++ b/modules/text/language_model/bert-base-chinese/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_bert import BertForSequenceClassification, BertModel -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="bert-base-chinese", - version="2.0.0", + version="2.0.1", summary= "bert_chinese_L-12_H-768_A-12, 12-layer, 768-hidden, 12-heads, 110M parameters.
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Bert(nn.Layer): """ Bert model @@ -41,181 +43,88 @@ class Bert(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Bert, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': self.model = BertForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='bert-base-chinese') + pretrained_model_name_or_path='bert-base-chinese', + num_classes=self.num_classes, + **kwargs + ) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained( + pretrained_model_name_or_path='bert-base-chinese', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-chinese') + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-chinese', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: sequence_output, pooled_output = 
result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'bert-base-chinese', 'bert-base-chinese-vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt" - download(url, os.path.join(DATA_HOME, 'bert-base-chinese')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-base-chinese', *args, **kwargs) diff --git a/modules/text/language_model/bert-base-multilingual-cased/README.md b/modules/text/language_model/bert-base-multilingual-cased/README.md index cfeccad4..a6881ca2 100644 --- a/modules/text/language_model/bert-base-multilingual-cased/README.md +++ b/modules/text/language_model/bert-base-multilingual-cased/README.md @@ -1,6 +1,7 @@ ```shell -$ hub install bert-base-multilingual-cased==2.0.0 +$ hub install bert-base-multilingual-cased==2.0.1 ``` +


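Every module.py in this patch routes `token-cls` evaluation through `paddlenlp.metrics.ChunkEvaluator` inside `forward()`. The standalone sketch below replays that exact call sequence (`compute(None, seq_lengths, preds, labels)`, then `update(...)`, then `accumulate()`) on a toy batch; the argument layout is mirrored from this patch and may differ in other paddlenlp releases.

```python
import paddle
from paddlenlp.metrics import ChunkEvaluator

label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
metric = ChunkEvaluator(label_list=label_list)

seq_lengths = paddle.to_tensor([5])           # number of valid tokens per example
preds = paddle.to_tensor([[0, 1, 6, 4, 5]])   # argmax over the token-level logits
labels = paddle.to_tensor([[0, 1, 6, 4, 5]])  # gold tag ids, indexed into label_list

num_infer, num_label, num_correct = metric.compute(None, seq_lengths, preds, labels)
metric.update(num_infer.numpy(), num_label.numpy(), num_correct.numpy())
precision, recall, f1_score = map(float, metric.accumulate())
print(precision, recall, f1_score)  # 1.0 1.0 1.0 on this toy batch
```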
@@ -13,29 +14,35 @@ $ hub install bert-base-multilingual-cased==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -44,7 +51,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -53,7 +62,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -67,16 +78,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='bert-base-multilingual-cased', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -84,7 +95,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -110,12 +123,12 @@ $ hub serving start -m bert-base-multilingual-cased import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ...
] +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/bert-base-multilingual-cased" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -148,3 +161,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/bert-base-multilingual-cased/module.py b/modules/text/language_model/bert-base-multilingual-cased/module.py index d164ba53..124a0ce4 100644 --- a/modules/text/language_model/bert-base-multilingual-cased/module.py +++ b/modules/text/language_model/bert-base-multilingual-cased/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_bert import BertForSequenceClassification, BertModel -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="bert-base-multilingual-cased", - version="2.0.0", + version="2.0.1", summary= "bert_multi_cased_L-12_H-768_A-12, 12-layer, 768-hidden, 12-heads, 110M parameters.
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Bert(nn.Layer): """ BERT model @@ -41,181 +43,88 @@ class Bert(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Bert, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': self.model = BertForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='bert-base-multilingual-cased') + pretrained_model_name_or_path='bert-base-multilingual-cased', + num_classes=self.num_classes, + **kwargs + ) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained( + pretrained_model_name_or_path='bert-base-multilingual-cased', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-multilingual-cased') + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-multilingual-cased', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return 
token_level_probs else: sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'bert-base-multilingual-cased', 'bert-base-multilingual-cased-vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-multilingual-cased-vocab.txt" - download(url, os.path.join(DATA_HOME, 'bert-base-multilingual-cased')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-base-multilingual-cased', *args, **kwargs) diff --git a/modules/text/language_model/bert-base-multilingual-uncased/README.md b/modules/text/language_model/bert-base-multilingual-uncased/README.md index 4f083b93..079b2a2b 100644 --- a/modules/text/language_model/bert-base-multilingual-uncased/README.md +++ b/modules/text/language_model/bert-base-multilingual-uncased/README.md @@ -1,6 +1,7 @@ ```shell -$ hub install bert-base-multilingual-uncased==2.0.0 +$ hub install bert-base-multilingual-uncased==2.0.1 ``` +


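The README examples in this patch only demonstrate `predict` for `seq-cls`. A corresponding `token-cls` prediction sketch is shown below, following the same documented `predict` interface; `/path/to/parameters` is a placeholder for a checkpoint produced by `hub.Trainer`, the `label_map` is the demo's MSRA-NER scheme, and the per-token layout of the returned labels is an assumption rather than something this patch pins down.

```python
import paddlehub as hub

# Class id -> BIO tag, mirroring the demo's MSRA-NER label_list.
label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
label_map = {idx: label for idx, label in enumerate(label_list)}

model = hub.Module(
    name='bert-base-multilingual-uncased',
    version='2.0.1',
    task='token-cls',
    load_checkpoint='/path/to/parameters',  # placeholder checkpoint path
    label_map=label_map)

# One text per sample, matching the single-text format used during fine-tuning.
data = [['今天是个好日子'], ['天气预报说今天要下雨']]
results = model.predict(data, max_seq_len=128, batch_size=1, use_gpu=False)
for text, labels in zip(data, results):
    print(text[0], labels)  # per-token BIO labels (assumed output layout)
```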
@@ -13,29 +14,35 @@ $ hub install bert-base-multilingual-uncased==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -44,7 +51,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -53,7 +62,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -67,16 +78,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='bert-base-multilingual-uncased', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -84,7 +95,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -110,12 +123,12 @@ $ hub serving start -m bert-base-multilingual-uncased import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ...
]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/bert-base-multilingual-uncased" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -148,3 +161,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/bert-base-multilingual-uncased/module.py b/modules/text/language_model/bert-base-multilingual-uncased/module.py index 410a8129..c957d7e3 100644 --- a/modules/text/language_model/bert-base-multilingual-uncased/module.py +++ b/modules/text/language_model/bert-base-multilingual-uncased/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_bert import BertForSequenceClassification, BertModel -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="bert-base-multilingual-uncased", - version="2.0.0", + version="2.0.1", summary= "bert_multi_uncased_L-12_H-768_A-12, 12-layer, 768-hidden, 12-heads, 110M parameters. 
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Bert(nn.Layer): """ BERT model @@ -41,182 +43,88 @@ class Bert(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Bert, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': self.model = BertForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='bert-base-multilingual-uncased') + pretrained_model_name_or_path='bert-base-multilingual-uncased', + num_classes=self.num_classes, + **kwargs + ) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained( + pretrained_model_name_or_path='bert-base-multilingual-uncased', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-multilingual-uncased') + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-multilingual-uncased', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + 
return token_level_probs else: sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'bert-base-multilingual-uncased', - 'bert-base-multilingual-uncased-vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-multilingual-uncased-vocab.txt" - download(url, os.path.join(DATA_HOME, 'bert-base-multilingual-uncased')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-base-multilingual-uncased', *args, **kwargs) diff --git a/modules/text/language_model/bert-base-uncased/README.md b/modules/text/language_model/bert-base-uncased/README.md index 2b2aa202..dfb5e864 100644 --- a/modules/text/language_model/bert-base-uncased/README.md +++ b/modules/text/language_model/bert-base-uncased/README.md @@ -1,6 +1,7 @@ ```shell -$ hub install bert-base-uncased==2.0.0 +$ hub install bert-base-uncased==2.0.1 ``` +
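The `token-cls` task added in this release pairs `BertForTokenClassification` with a `ChunkEvaluator`, so its `label_map` should use a chunkable tagging scheme. A sketch with a hypothetical BIO tag set follows; per-token post-processing lives in the shared `TransformerModule` rather than in this module.py, so the exact output layout of `predict` is assumed here.

```python
import paddlehub as hub

# Hypothetical BIO tag set; ChunkEvaluator derives entity chunks from it.
label_map = {0: 'B-PER', 1: 'I-PER', 2: 'B-ORG', 3: 'I-ORG', 4: 'O'}

model = hub.Module(
    name='bert-base-uncased',
    version='2.0.1',
    task='token-cls',     # sequence labeling; num_classes is len(label_map)
    label_map=label_map)

# Expected to yield one tag per tokenized position of each sample.
results = model.predict([['Barack Obama visited Paris']], max_seq_len=128, batch_size=1)
```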


@@ -13,29 +14,35 @@ $ hub install bert-base-uncased==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -44,7 +51,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -53,7 +62,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -67,16 +78,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='bert-base-uncased', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -84,7 +95,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -110,12 +123,12 @@ $ hub serving start -m bert-base-uncased import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... 
]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/bert-base-uncased" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -148,3 +161,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/bert-base-uncased/module.py b/modules/text/language_model/bert-base-uncased/module.py index 98c49b03..8c06ad34 100644 --- a/modules/text/language_model/bert-base-uncased/module.py +++ b/modules/text/language_model/bert-base-uncased/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_bert import BertForSequenceClassification, BertModel -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="bert-base-uncased", - version="2.0.0", + version="2.0.1", summary= "bert_uncased_L-12_H-768_A-12, 12-layer, 768-hidden, 12-heads, 110M parameters. 
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Bert(nn.Layer): """ BERT model @@ -41,181 +43,80 @@ class Bert(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Bert, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': - self.model = BertForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='bert-base-uncased') + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': + self.model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path='bert-base-uncased', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained(pretrained_model_name_or_path='bert-base-uncased', num_classes=self.num_classes, **kwargs) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-uncased') + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-uncased', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return 
token_level_probs else: sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'bert-base-uncased', 'bert-base-uncased-vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-uncased-vocab.txt" - download(url, os.path.join(DATA_HOME, 'bert-base-uncased')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-base-uncased', *args, **kwargs) diff --git a/modules/text/language_model/bert-large-cased/README.md b/modules/text/language_model/bert-large-cased/README.md index 54219b12..344d5441 100644 --- a/modules/text/language_model/bert-large-cased/README.md +++ b/modules/text/language_model/bert-large-cased/README.md @@ -1,6 +1,7 @@ ```shell -$ hub install bert-large-cased==2.0.0 +$ hub install bert-large-cased==2.0.1 ``` +
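The token-cls branch of `forward()` below drives the metric through `compute` → `update` → `accumulate`. The same call pattern can be exercised standalone; this toy sketch assumes the four-argument `ChunkEvaluator.compute(None, lengths, preds, labels)` form that the patched code relies on in the pinned paddlenlp version.

```python
import paddle
from paddlenlp.metrics import ChunkEvaluator

metric = ChunkEvaluator(label_list=['B-PER', 'I-PER', 'O'])

# Toy batch: one sequence of length 4; ids index into label_list above.
seq_lengths = paddle.to_tensor([4])
preds = paddle.to_tensor([[0, 1, 2, 2]])    # predicted tag ids
labels = paddle.to_tensor([[0, 1, 2, 2]])   # gold tag ids

num_infer, num_label, num_correct = metric.compute(None, seq_lengths, preds, labels)
metric.update(num_infer.numpy(), num_label.numpy(), num_correct.numpy())
precision, recall, f1 = map(float, metric.accumulate())
print(f1)  # 1.0: the prediction recovers the single gold PER chunk exactly
```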


@@ -13,29 +14,35 @@ $ hub install bert-large-cased==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -44,7 +51,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -53,7 +62,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -67,16 +78,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='bert-large-cased', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -84,7 +95,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -110,12 +123,12 @@ $ hub serving start -m bert-large-cased import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... 
]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/bert-large-cased" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -148,3 +161,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/bert-large-cased/module.py b/modules/text/language_model/bert-large-cased/module.py index 58b03010..d456b78f 100644 --- a/modules/text/language_model/bert-large-cased/module.py +++ b/modules/text/language_model/bert-large-cased/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_bert import BertForSequenceClassification, BertModel -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="bert-large-cased", - version="2.0.0", + version="2.0.1", summary= "bert_cased_L-24_H-1024_A-16, 24-layer, 1024-hidden, 16-heads, 340M parameters. 
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Bert(nn.Layer): """ BERT model @@ -41,180 +43,80 @@ class Bert(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Bert, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': - self.model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path='bert-large-cased') + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': + self.model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path='bert-large-cased', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained(pretrained_model_name_or_path='bert-large-cased', num_classes=self.num_classes, **kwargs) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-large-cased') + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-large-cased', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: 
sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'bert-large-cased', 'bert-large-cased-vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-large-cased-vocab.txt" - download(url, os.path.join(DATA_HOME, 'bert-large-cased')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-large-cased', *args, **kwargs) diff --git a/modules/text/language_model/bert-large-uncased/README.md b/modules/text/language_model/bert-large-uncased/README.md index 4520f37b..e2964f85 100644 --- a/modules/text/language_model/bert-large-uncased/README.md +++ b/modules/text/language_model/bert-large-uncased/README.md @@ -1,6 +1,7 @@ ```shell -$ hub install bert-large-uncased==2.0.0 +$ hub install bert-large-uncased==2.0.1 ``` +
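When `task=None` the module wraps the bare `BertModel`, and `get_embedding` becomes available. A sketch following the `[pooled_feature, seq_feature]` per-sample layout documented in this README (the reworked implementation lives in the shared `TransformerModule`):

```python
import paddlehub as hub

module = hub.Module(name='bert-large-uncased', version='2.0.1', task=None)

# One- and two-text samples may be mixed, as with predict().
data = [['今天是个好日子'], ['今天是个好日子', '天气预报说今天要下雨']]
results = module.get_embedding(data, max_seq_len=128, batch_size=2, use_gpu=False)

for pooled_feature, seq_feature in results:
    # pooled_feature: one sentence-level vector (hidden size 1024 for bert-large)
    # seq_feature: one vector per token position
    print(len(pooled_feature), len(seq_feature))
```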


@@ -13,29 +14,35 @@ $ hub install bert-large-uncased==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -44,7 +51,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -53,7 +62,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -67,16 +78,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( - name='bert-large-画丶cased', - version='2.0.0', - task='sequence_classification', + name='bert-large-uncased', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -84,7 +95,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -110,12 +123,12 @@ $ hub serving start -m bert-large-uncased import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... 
]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/bert-large-uncased" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -148,3 +161,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/bert-large-uncased/module.py b/modules/text/language_model/bert-large-uncased/module.py index 57020f07..cedcba1d 100644 --- a/modules/text/language_model/bert-large-uncased/module.py +++ b/modules/text/language_model/bert-large-uncased/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_bert import BertForSequenceClassification, BertModel -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="bert-large-uncased", - version="2.0.0", + version="2.0.1", summary= "bert_uncased_L-24_H-1024_A-16, 24-layer, 1024-hidden, 16-heads, 340M parameters. 
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Bert(nn.Layer): """ BERT model @@ -41,181 +43,80 @@ class Bert(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Bert, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': - self.model = BertForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='bert-large-uncased') + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': + self.model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path='bert-large-uncased', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained(pretrained_model_name_or_path='bert-large-uncased', num_classes=self.num_classes, **kwargs) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-large-uncased') + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-large-uncased', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return 
token_level_probs else: sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'bert-large-uncased', 'bert-large-uncased-vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddle-hapi.bj.bcebos.com/models/bert/bert-large-uncased-vocab.txt" - download(url, os.path.join(DATA_HOME, 'bert-large-uncased')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-large-uncased', *args, **kwargs) diff --git a/modules/text/language_model/chinese_bert_wwm/README.md b/modules/text/language_model/chinese_bert_wwm/README.md index 96ae17ac..6f0460c5 100644 --- a/modules/text/language_model/chinese_bert_wwm/README.md +++ b/modules/text/language_model/chinese_bert_wwm/README.md @@ -1,5 +1,5 @@ ```shell -$ hub install chinese-bert-wwm==1.0.0 +$ hub install chinese-bert-wwm==2.0.1 ```
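This README drops the static-graph `context()` workflow in favour of the dynamic-graph constructor, so fine-tuning now goes through `hub.Trainer` rather than strategies such as ULMFiTStrategy. Below is a sequence-labeling sketch patterned on the demo/sequence_labeling scripts this patch updates; the dataset and trainer signatures are assumed from that demo.

```python
import paddle
import paddlehub as hub
from paddlehub.datasets import MSRA_NER

# MSRA-NER BIO label set, as used by the updated demo.
label_list = ['B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'O']
label_map = {i: label for i, label in enumerate(label_list)}

model = hub.Module(name='chinese-bert-wwm', version='2.0.1', task='token-cls', label_map=label_map)

train_dataset = MSRA_NER(tokenizer=model.get_tokenizer(), max_seq_len=128, mode='train')

optimizer = paddle.optimizer.AdamW(learning_rate=5e-5, parameters=model.parameters())
trainer = hub.Trainer(model, optimizer, checkpoint_dir='ckpt_chinese_bert_wwm', use_gpu=False)
trainer.train(train_dataset, epochs=3, batch_size=32, save_interval=1)
```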


@@ -9,56 +9,50 @@ $ hub install chinese-bert-wwm==1.0.0 ## API ```python -def context( - trainable=True, - max_seq_len=128 +def __init__( + task=None, + load_checkpoint=None, + label_map=None, + num_classes=2, + **kwargs, ) ``` -用于获取Module的上下文信息,得到输入、输出以及预训练的Paddle Program副本 -**参数** -```shell -$ hub install chinese-bert-wwm==1.0.0 -``` -

-
-

+创建Module对象(动态图组网版本)。 -更多详情请参考[BERT论文](https://arxiv.org/abs/1810.04805), [Chinese-BERT-wwm技术报告](https://arxiv.org/abs/1906.08101) +**参数** + +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 +* `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 +* `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 -## API ```python -def context( - trainable=True, - max_seq_len=128 +def predict( + data, + max_seq_len=128, + batch_size=1, + use_gpu=False ) ``` -用于获取Module的上下文信息,得到输入、输出以及预训练的Paddle Program副本 **参数** -> trainable:设置为True时,Module中的参数在Fine-tune时也会随之训练,否则保持不变。 -> max_seq_len:BERT模型的最大序列长度,若序列长度不足,会通过padding方式补到**max_seq_len**, 若序列长度大于该值,则会以截断方式让序列长度为**max_seq_len**,max_seq_len可取值范围为0~512; +* `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 +* `max_seq_len`:模型处理文本的最大长度 +* `batch_size`:模型批处理大小 +* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** -> inputs:dict类型,有以下字段: -> >**position_ids**存放输入文本tokenize后各token所在该文本的位置,shape为\[batch_size, max_seq_len\],int64类型; -> >**segment_ids**存放各token所在文本的标识(token属于文本1或者文本2),shape为\[batch_size, max_seq_len\],int64类型; -> >**input_mask**存放token是否为padding的标识,shape为\[batch_size, max_seq_len\],int64类型; -> -> outputs:dict类型,Module的输出特征,有以下字段: -> >**pooled_output**字段存放句子粒度的特征,可用于文本分类等任务,shape为 \[batch_size, 768\],int64类型; -> >**sequence_output**字段存放字粒度的特征,可用于序列标注等任务,shape为 \[batch_size, seq_len, 768\],int64类型; -> -> program:包含该Module计算图的Program。 - - ```python def get_embedding( - texts, - use_gpu=False, - batch_size=1 + data, + max_seq_len=128, + batch_size=1, + use_gpu=False ) ``` @@ -66,159 +60,79 @@ def get_embedding( **参数** -> texts:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 -> use_gpu:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 +* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** -> results:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。 -> - -```python -def get_params_layer() -``` - -用于获取参数层信息,该方法与ULMFiTStrategy联用可以严格按照层数设置分层学习率与逐层解冻。 +* `results`:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。 -**参数** - -> 无 - -**返回** - -> params_layer:dict类型,key为参数名,值为参数所在层数 **代码示例** ```python import paddlehub as hub -# Load $ hub install chinese-bert-wwm pretrained model -module = hub.Module(name="chinese-bert-wwm") -inputs, outputs, program = module.context(trainable=True, max_seq_len=128) - -# Must feed all the tensor of chinese-bert-wwm's module need -input_ids = inputs["input_ids"] -position_ids = inputs["position_ids"] -segment_ids = inputs["segment_ids"] -input_mask = inputs["input_mask"] - -# Use "pooled_output" for sentence-level output. -pooled_output = outputs["pooled_output"] - -# Use "sequence_output" for token-level output. -sequence_output = outputs["sequence_output"] - -# Use "get_embedding" to get embedding result. 
-embedding_result = module.get_embedding(texts=[["Sample1_text_a"],["Sample2_text_a","Sample2_text_b"]], use_gpu=True)
-
-# Use "get_params_layer" to get params layer and used to ULMFiTStrategy.
-params_layer = module.get_params_layer()
-strategy = hub.finetune.strategy.ULMFiTStrategy(frz_params_layer=params_layer, dis_params_layer=params_layer)
+data = [
+    ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'],
+    ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'],
+    ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'],
+]
+label_map = {0: 'negative', 1: 'positive'}
+
+model = hub.Module(
+    name='chinese-bert-wwm',
+    version='2.0.1',
+    task='seq-cls',
+    load_checkpoint='/path/to/parameters',
+    label_map=label_map)
+results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
+for idx, text in enumerate(data):
+    print('Data: {} \t Label: {}'.format(text, results[idx]))
 ```
 
-## 查看代码
-https://github.com/ymcui/Chinese-BERT-wwm
-
+详情可参考PaddleHub示例:
+- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification)
+- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling)
 
-## 贡献者
+## 服务部署
 
-[ymcui](https://github.com/ymcui)
+PaddleHub Serving可以部署一个在线获取预训练词向量的服务。
 
-## 依赖
+### Step1: 启动PaddleHub Serving
 
-paddlepaddle >= 1.6.2
+运行启动命令:
 
-paddlehub >= 1.6.0
-
-## 更新历史
-
-* 1.0.0
-
-  初始发布
-
-* 1.0.0
-
-  支持get_embedding与get_params_layer
-
-> trainable:设置为True时,Module中的参数在Fine-tune时也会随之训练,否则保持不变。
-> max_seq_len:BERT模型的最大序列长度,若序列长度不足,会通过padding方式补到**max_seq_len**, 若序列长度大于该值,则会以截断方式让序列长度为**max_seq_len**,max_seq_len可取值范围为0~512;
-
-**返回**
-> inputs:dict类型,有以下字段:
-> >**position_ids**存放输入文本tokenize后各token所在该文本的位置,shape为\[batch_size, max_seq_len\],int64类型;
-> >**segment_ids**存放各token所在文本的标识(token属于文本1或者文本2),shape为\[batch_size, max_seq_len\],int64类型;
-> >**input_mask**存放token是否为padding的标识,shape为\[batch_size, max_seq_len\],int64类型;
->
-> outputs:dict类型,Module的输出特征,有以下字段:
-> >**pooled_output**字段存放句子粒度的特征,可用于文本分类等任务,shape为 \[batch_size, 768\],int64类型;
-> >**sequence_output**字段存放字粒度的特征,可用于序列标注等任务,shape为 \[batch_size, seq_len, 768\],int64类型;
->
-> program:包含该Module计算图的Program。
-
-
-
-```python
-def get_embedding(
-    texts,
-    use_gpu=False,
-    batch_size=1
-)
-```
-
-用于获取输入文本的句子粒度特征与字粒度特征
-
-**参数**
-
-> texts:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。
-> use_gpu:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
-
-**返回**
-
-> results:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。
->
-
-```python
-def get_params_layer()
+```shell
+$ hub serving start -m chinese-bert-wwm
 ```
 
-用于获取参数层信息,该方法与ULMFiTStrategy联用可以严格按照层数设置分层学习率与逐层解冻。
+这样就完成了一个获取预训练词向量服务化API的部署,默认端口号为8866。
 
-**参数**
-
-> 无
-
-**返回**
+**NOTE:** 如使用GPU预测,则需要在启动服务之前设置CUDA_VISIBLE_DEVICES环境变量,否则无需设置。
 
-> params_layer:dict类型,key为参数名,值为参数所在层数
+### Step2: 发送预测请求
 
-**代码示例**
+配置好服务端后,以下几行代码即可发送预测请求,获取预测结果
 
 ```python
-import paddlehub as hub
-
-# Load $ hub install chinese-bert-wwm pretrained model
-module = hub.Module(name="chinese-bert-wwm")
-inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
-
-# Must feed all the tensor of chinese-bert-wwm's module need
-input_ids = inputs["input_ids"]
-position_ids = inputs["position_ids"]
-segment_ids = inputs["segment_ids"]
-input_mask = inputs["input_mask"]
-
-# Use "pooled_output" for sentence-level output.
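# The example above covers `seq-cls`; `token-cls` prediction follows the same
# pattern but returns one tag per input token. A sketch assuming a checkpoint
# fine-tuned on a BIO-tagged NER set; the tag map and path are placeholders.
import paddlehub as hub

label_map = {0: 'B-PER', 1: 'I-PER', 2: 'B-ORG', 3: 'I-ORG', 4: 'B-LOC', 5: 'I-LOC', 6: 'O'}

model = hub.Module(
    name='chinese-bert-wwm',
    version='2.0.1',
    task='token-cls',
    load_checkpoint='/path/to/token_cls_parameters',  # placeholder path
    label_map=label_map)

data = [['今天是个好日子'], ['天气预报说今天要下雨']]
results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
for (text,), tags in zip(data, results):
    print(text, tags)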
-pooled_output = outputs["pooled_output"]
-
-# Use "sequence_output" for token-level output.
-sequence_output = outputs["sequence_output"]
-
-# Use "get_embedding" to get embedding result.
-embedding_result = module.get_embedding(texts=[["Sample1_text_a"],["Sample2_text_a","Sample2_text_b"]], use_gpu=True)
-
-# Use "get_params_layer" to get params layer and used to ULMFiTStrategy.
-params_layer = module.get_params_layer()
-strategy = hub.finetune.strategy.ULMFiTStrategy(frz_params_layer=params_layer, dis_params_layer=params_layer)
+import requests
+import json
+
+# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ... ]
+text = [["今天是个好日子"], ["天气预报说今天要下雨"]]
+# 以key的方式指定text传入预测方法时的参数,此例中为"data"
+# 对应本地部署,则为module.get_embedding(data=text)
+data = {"data": text}
+# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip
+url = "http://10.12.121.132:8866/predict/chinese-bert-wwm"
+# 指定post请求的headers为application/json方式
+headers = {"Content-Type": "application/json"}
+
+r = requests.post(url=url, headers=headers, data=json.dumps(data))
+print(r.json())
 ```
 
 ## 查看代码
@@ -231,12 +145,16 @@ https://github.com/ymcui/Chinese-BERT-wwm
 
 ## 依赖
 
-paddlepaddle >= 1.6.2
+paddlepaddle >= 2.0.0
 
-paddlehub >= 1.6.0
+paddlehub >= 2.0.0
 
 ## 更新历史
 
 * 1.0.0
 
   初始发布
+
+* 2.0.1
+
+  全面升级动态图,接口有所变化。任务名称调整,增加序列标注任务`token-cls`
diff --git a/modules/text/language_model/chinese_bert_wwm/model/__init__.py b/modules/text/language_model/chinese_bert_wwm/model/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/modules/text/language_model/chinese_bert_wwm/model/bert.py b/modules/text/language_model/chinese_bert_wwm/model/bert.py
deleted file mode 100644
index 819bdbad..00000000
--- a/modules/text/language_model/chinese_bert_wwm/model/bert.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
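# For comparison, the embeddings served above can also be computed locally by
# calling get_embedding() directly. A minimal sketch based on the signature
# documented in this README; per the docs, each result pairs a pooled
# sentence-level vector with one vector per token.
import paddlehub as hub

module = hub.Module(name='chinese-bert-wwm', version='2.0.1', task=None)  # bare encoder
results = module.get_embedding(
    data=[['今天是个好日子'], ['天气预报说今天要下雨']],
    max_seq_len=128,
    batch_size=1,
    use_gpu=False)

for pooled_feature, seq_feature in results:
    print(len(pooled_feature), len(seq_feature))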
-"""BERT model.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import six -import json - -import paddle.fluid as fluid - -from chinese_bert_wwm.model.transformer_encoder import encoder, pre_process_layer - - -class BertConfig(object): - def __init__(self, config_path): - self._config_dict = self._parse(config_path) - - def _parse(self, config_path): - try: - with open(config_path) as json_file: - config_dict = json.load(json_file) - except Exception: - raise IOError("Error in parsing bert model config file '%s'" % config_path) - else: - return config_dict - - def __getitem__(self, key): - return self._config_dict[key] - - def print_config(self): - for arg, value in sorted(six.iteritems(self._config_dict)): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -class BertModel(object): - def __init__(self, src_ids, position_ids, sentence_ids, input_mask, config, weight_sharing=True, use_fp16=False): - - self._emb_size = config['hidden_size'] - self._n_layer = config['num_hidden_layers'] - self._n_head = config['num_attention_heads'] - self._voc_size = config['vocab_size'] - self._max_position_seq_len = config['max_position_embeddings'] - self._sent_types = config['type_vocab_size'] - self._hidden_act = config['hidden_act'] - self._prepostprocess_dropout = config['hidden_dropout_prob'] - self._attention_dropout = config['attention_probs_dropout_prob'] - self._weight_sharing = weight_sharing - - self._word_emb_name = "word_embedding" - self._pos_emb_name = "pos_embedding" - self._sent_emb_name = "sent_embedding" - self._dtype = "float16" if use_fp16 else "float32" - - # Initialize all weigths by truncated normal initializer, and all biases - # will be initialized by constant zero by default. 
- self._param_initializer = fluid.initializer.TruncatedNormal(scale=config['initializer_range']) - - self._build_model(src_ids, position_ids, sentence_ids, input_mask) - - def _build_model(self, src_ids, position_ids, sentence_ids, input_mask): - # padding id in vocabulary must be set to 0 - emb_out = fluid.layers.embedding(input=src_ids, - size=[self._voc_size, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._word_emb_name, - initializer=self._param_initializer), - is_sparse=False) - position_emb_out = fluid.layers.embedding(input=position_ids, - size=[self._max_position_seq_len, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._pos_emb_name, - initializer=self._param_initializer)) - - sent_emb_out = fluid.layers.embedding(sentence_ids, - size=[self._sent_types, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._sent_emb_name, - initializer=self._param_initializer)) - - emb_out = emb_out + position_emb_out - emb_out = emb_out + sent_emb_out - - emb_out = pre_process_layer(emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder') - - if self._dtype == "float16": - input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype) - - self_attn_mask = fluid.layers.matmul(x=input_mask, y=input_mask, transpose_y=True) - self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) - n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1) - n_head_self_attn_mask.stop_gradient = True - - self._enc_out = encoder(enc_input=emb_out, - attn_bias=n_head_self_attn_mask, - n_layer=self._n_layer, - n_head=self._n_head, - d_key=self._emb_size // self._n_head, - d_value=self._emb_size // self._n_head, - d_model=self._emb_size, - d_inner_hid=self._emb_size * 4, - prepostprocess_dropout=self._prepostprocess_dropout, - attention_dropout=self._attention_dropout, - relu_dropout=0, - hidden_act=self._hidden_act, - preprocess_cmd="", - postprocess_cmd="dan", - param_initializer=self._param_initializer, - name='encoder') - - def get_sequence_output(self): - return self._enc_out - - def get_pooled_output(self): - """Get the first feature of each sequence for classification""" - - next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1]) - next_sent_feat = fluid.layers.fc(input=next_sent_feat, - size=self._emb_size, - act="tanh", - param_attr=fluid.ParamAttr(name="pooled_fc.w_0", - initializer=self._param_initializer), - bias_attr="pooled_fc.b_0") - return next_sent_feat - - def get_pretraining_output(self, mask_label, mask_pos, labels): - """Get the loss & accuracy for pretraining""" - - mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') - - # extract the first token feature in each sentence - next_sent_feat = self.get_pooled_output() - reshaped_emb_out = fluid.layers.reshape(x=self._enc_out, shape=[-1, self._emb_size]) - # extract masked tokens' feature - mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) - - # transform: fc - mask_trans_feat = fluid.layers.fc(input=mask_feat, - size=self._emb_size, - act=self._hidden_act, - param_attr=fluid.ParamAttr(name='mask_lm_trans_fc.w_0', - initializer=self._param_initializer), - bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0')) - # transform: layer norm - mask_trans_feat = pre_process_layer(mask_trans_feat, 'n', name='mask_lm_trans') - - mask_lm_out_bias_attr = fluid.ParamAttr(name="mask_lm_out_fc.b_0", - 
initializer=fluid.initializer.Constant(value=0.0)) - if self._weight_sharing: - fc_out = fluid.layers.matmul(x=mask_trans_feat, - y=fluid.default_main_program().global_block().var(self._word_emb_name), - transpose_y=True) - fc_out += fluid.layers.create_parameter(shape=[self._voc_size], - dtype=self._dtype, - attr=mask_lm_out_bias_attr, - is_bias=True) - - else: - fc_out = fluid.layers.fc(input=mask_trans_feat, - size=self._voc_size, - param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0", - initializer=self._param_initializer), - bias_attr=mask_lm_out_bias_attr) - - mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label) - mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss) - - next_sent_fc_out = fluid.layers.fc(input=next_sent_feat, - size=2, - param_attr=fluid.ParamAttr(name="next_sent_fc.w_0", - initializer=self._param_initializer), - bias_attr="next_sent_fc.b_0") - - next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(logits=next_sent_fc_out, - label=labels, - return_softmax=True) - - next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax, label=labels) - - mean_next_sent_loss = fluid.layers.mean(next_sent_loss) - - loss = mean_next_sent_loss + mean_mask_lm_loss - return next_sent_acc, mean_mask_lm_loss, loss diff --git a/modules/text/language_model/chinese_bert_wwm/model/transformer_encoder.py b/modules/text/language_model/chinese_bert_wwm/model/transformer_encoder.py deleted file mode 100644 index b15d8388..00000000 --- a/modules/text/language_model/chinese_bert_wwm/model/transformer_encoder.py +++ /dev/null @@ -1,295 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Transformer encoder.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from functools import partial - -import paddle.fluid as fluid -import paddle.fluid.layers as layers - - -def multi_head_attention(queries, - keys, - values, - attn_bias, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0., - cache=None, - param_initializer=None, - name='multi_head_att'): - """ - Multi-Head Attention. Note that attn_bias is added to the logit before - computing softmax activiation to mask certain selected positions so that - they will not considered in attention weights. - """ - keys = queries if keys is None else keys - values = keys if values is None else values - - if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): - raise ValueError("Inputs: quries, keys and values should all be 3-D tensors.") - - def __compute_qkv(queries, keys, values, n_head, d_key, d_value): - """ - Add linear projection to queries, keys, and values. 
- """ - q = layers.fc(input=queries, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_query_fc.w_0', initializer=param_initializer), - bias_attr=name + '_query_fc.b_0') - k = layers.fc(input=keys, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_key_fc.w_0', initializer=param_initializer), - bias_attr=name + '_key_fc.b_0') - v = layers.fc(input=values, - size=d_value * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_value_fc.w_0', initializer=param_initializer), - bias_attr=name + '_value_fc.b_0') - return q, k, v - - def __split_heads(x, n_head): - """ - Reshape the last dimension of inpunt tensor x so that it becomes two - dimensions and then transpose. Specifically, input a tensor with shape - [bs, max_sequence_length, n_head * hidden_dim] then output a tensor - with shape [bs, n_head, max_sequence_length, hidden_dim]. - """ - hidden_size = x.shape[-1] - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) - - # permuate the dimensions into: - # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) - - def __combine_heads(x): - """ - Transpose and then reshape the last two dimensions of inpunt tensor x - so that it becomes one dimension, which is reverse to __split_heads. - """ - if len(x.shape) == 3: return x - if len(x.shape) != 4: - raise ValueError("Input(x) should be a 4-D Tensor.") - - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - return layers.reshape(x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True) - - def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): - """ - Scaled Dot-Product Attention - """ - scaled_q = layers.scale(x=q, scale=d_key**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - if attn_bias: - product += attn_bias - weights = layers.softmax(product) - if dropout_rate: - weights = layers.dropout(weights, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.matmul(weights, v) - return out - - q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - - if cache is not None: # use cache and concat time steps - # Since the inplace reshape in __split_heads changes the shape of k and - # v, which is the cache input for next time step, reshape the cache - # input from the previous time step first. - k = cache["k"] = layers.concat([layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1) - v = cache["v"] = layers.concat([layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1) - - q = __split_heads(q, n_head) - k = __split_heads(k, n_head) - v = __split_heads(v, n_head) - - ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate) - - out = __combine_heads(ctx_multiheads) - - # Project back to the model size. 
- proj_out = layers.fc(input=out, - size=d_model, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_output_fc.w_0', initializer=param_initializer), - bias_attr=name + '_output_fc.b_0') - return proj_out - - -def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate, hidden_act, param_initializer=None, name='ffn'): - """ - Position-wise Feed-Forward Networks. - This module consists of two linear transformations with a ReLU activation - in between, which is applied to each position separately and identically. - """ - hidden = layers.fc(input=x, - size=d_inner_hid, - num_flatten_dims=2, - act=hidden_act, - param_attr=fluid.ParamAttr(name=name + '_fc_0.w_0', initializer=param_initializer), - bias_attr=name + '_fc_0.b_0') - if dropout_rate: - hidden = layers.dropout(hidden, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.fc(input=hidden, - size=d_hid, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_fc_1.w_0', initializer=param_initializer), - bias_attr=name + '_fc_1.b_0') - return out - - -def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., name=''): - """ - Add residual connection, layer normalization and droput to the out tensor - optionally according to the value of process_cmd. - This will be used before or after multi-head attention and position-wise - feed-forward networks. - """ - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out_dtype = out.dtype - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float32") - out = layers.layer_norm(out, - begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.ParamAttr(name=name + '_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr(name=name + '_layer_norm_bias', - initializer=fluid.initializer.Constant(0.))) - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float16") - elif cmd == "d": # add dropout - if dropout_rate: - out = layers.dropout(out, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - return out - - -pre_process_layer = partial(pre_post_process_layer, None) -post_process_layer = pre_post_process_layer - - -def encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """The encoder layers that can be stacked to form a deep encoder. - This module consits of a multi-head (self) attention followed by - position-wise feed-forward networks and both the two components companied - with the post_process_layer to add residual connection, layer normalization - and droput. 
- """ - attn_output = multi_head_attention(pre_process_layer(enc_input, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_att'), - None, - None, - attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - param_initializer=param_initializer, - name=name + '_multi_head_att') - attn_output = post_process_layer(enc_input, - attn_output, - postprocess_cmd, - prepostprocess_dropout, - name=name + '_post_att') - ffd_output = positionwise_feed_forward(pre_process_layer(attn_output, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_ffn'), - d_inner_hid, - d_model, - relu_dropout, - hidden_act, - param_initializer=param_initializer, - name=name + '_ffn') - return post_process_layer(attn_output, ffd_output, postprocess_cmd, prepostprocess_dropout, name=name + '_post_ffn') - - -def encoder(enc_input, - attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """ - The encoder is composed of a stack of identical layers returned by calling - encoder_layer. - """ - for i in range(n_layer): - enc_output = encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - param_initializer=param_initializer, - name=name + '_layer_' + str(i)) - enc_input = enc_output - enc_output = pre_process_layer(enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder") - - return enc_output diff --git a/modules/text/language_model/chinese_bert_wwm/module.py b/modules/text/language_model/chinese_bert_wwm/module.py index 70ea366d..3ee03088 100644 --- a/modules/text/language_model/chinese_bert_wwm/module.py +++ b/modules/text/language_model/chinese_bert_wwm/module.py @@ -1,7 +1,6 @@ -# coding:utf-8 -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the "License" +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # @@ -12,62 +11,121 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
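# The rewritten dygraph module below restores fine-tuned weights through a
# plain paddle.load()/set_state_dict() pair in its `load_checkpoint` branch.
# A sketch of the full save/restore round trip; the file name is a placeholder.
import paddle
import paddlehub as hub

model = hub.Module(name='chinese-bert-wwm', version='2.0.1', task='seq-cls')
# ... fine-tune the model here ...

paddle.save(model.state_dict(), 'bert_wwm_seq_cls.pdparams')  # placeholder name

restored = hub.Module(
    name='chinese-bert-wwm',
    version='2.0.1',
    task='seq-cls',
    load_checkpoint='bert_wwm_seq_cls.pdparams')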
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - +from typing import Dict import os +import math -from paddlehub import TransformerModule -from paddlehub.module.module import moduleinfo +import paddle +import paddle.nn as nn +import paddle.nn.functional as F -from chinese_bert_wwm.model.bert import BertConfig, BertModel +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule +from paddlehub.utils.log import logger @moduleinfo( name="chinese-bert-wwm", - version="1.0.0", - summary="chinese-bert-wwm, 12-layer, 768-hidden, 12-heads, 110M parameters ", + version="2.0.1", + summary= + "chinese-bert-wwm, 12-layer, 768-hidden, 12-heads, 110M parameters. The module is executed as paddle.dygraph.", author="ymcui", author_email="ymcui@ir.hit.edu.cn", type="nlp/semantic_model", + meta=TransformerModule ) -class BertWwm(TransformerModule): - def _initialize(self): - self.MAX_SEQ_LEN = 512 - self.params_path = os.path.join(self.directory, "assets", "params") - self.vocab_path = os.path.join(self.directory, "assets", "vocab.txt") +class BertWwm(nn.Layer): + """ + BertWwm model + """ - bert_config_path = os.path.join(self.directory, "assets", "bert_config.json") - self.bert_config = BertConfig(bert_config_path) + def __init__( + self, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, + ): + super(BertWwm, self).__init__() + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes - def net(self, input_ids, position_ids, segment_ids, input_mask): - """ - create neural network. + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': + self.model = BertForSequenceClassification.from_pretrained( + pretrained_model_name_or_path='bert-wwm-chinese', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained( + pretrained_model_name_or_path='bert-wwm-chinese', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) + elif task is None: + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-wwm-chinese', **kwargs) + else: + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) - Args: - input_ids (tensor): the word ids. - position_ids (tensor): the position ids. - segment_ids (tensor): the segment ids. - input_mask (tensor): the padding mask. + self.task = task - Returns: - pooled_output (tensor): sentence-level output for classification task. - sequence_output (tensor): token-level output for sequence task. 
- """ - bert = BertModel(src_ids=input_ids, - position_ids=position_ids, - sentence_ids=segment_ids, - input_mask=input_mask, - config=self.bert_config, - use_fp16=False) - pooled_output = bert.get_pooled_output() - sequence_output = bert.get_sequence_output() - return pooled_output, sequence_output + if load_checkpoint is not None and os.path.isfile(load_checkpoint): + state_dict = paddle.load(load_checkpoint) + self.set_state_dict(state_dict) + logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + if self.task == 'seq-cls': + logits = result + probs = F.softmax(logits, axis=1) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs + else: + sequence_output, pooled_output = result + return sequence_output, pooled_output -if __name__ == '__main__': - test_module = BertWwm() + @staticmethod + def get_tokenizer(*args, **kwargs): + """ + Gets the tokenizer that is customized for this module. + """ + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-wwm-chinese', *args, **kwargs) diff --git a/modules/text/language_model/chinese_bert_wwm_ext/README.md b/modules/text/language_model/chinese_bert_wwm_ext/README.md index 79d742a5..03357e3c 100644 --- a/modules/text/language_model/chinese_bert_wwm_ext/README.md +++ b/modules/text/language_model/chinese_bert_wwm_ext/README.md @@ -1,5 +1,5 @@ ```shell -$ hub install chinese-bert-wwm-ext==1.0.0 +$ hub install chinese-bert-wwm-ext==2.0.1 ```
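Since the rewritten modules are plain `paddle.nn.Layer`s, they plug directly into the PaddleHub 2.0 dygraph Fine-tune API. The sketch below follows the text-classification demo linked in these READMEs; it assumes the `hub.datasets.ChnSentiCorp` wrapper and `hub.Trainer` behave as in that demo, and the hyper-parameters are illustrative.

```python
import paddle
import paddlehub as hub

model = hub.Module(name='chinese-bert-wwm-ext', version='2.0.1', task='seq-cls', num_classes=2)

# Dataset wrappers as used in the PaddleHub text-classification demo.
train_dataset = hub.datasets.ChnSentiCorp(tokenizer=model.get_tokenizer(), max_seq_len=128, mode='train')
dev_dataset = hub.datasets.ChnSentiCorp(tokenizer=model.get_tokenizer(), max_seq_len=128, mode='dev')

optimizer = paddle.optimizer.AdamW(learning_rate=5e-5, parameters=model.parameters())
trainer = hub.Trainer(model, optimizer, checkpoint_dir='bert_wwm_ext_seq_cls', use_gpu=False)
trainer.train(train_dataset, epochs=3, batch_size=32, eval_dataset=dev_dataset, save_interval=1)
```

Checkpoints written by the trainer can then be handed back to the module through `load_checkpoint` for prediction.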


@@ -9,94 +9,130 @@ $ hub install chinese-bert-wwm-ext==1.0.0 ## API ```python -def context( - trainable=True, - max_seq_len=128 +def __init__( + task=None, + load_checkpoint=None, + label_map=None, + num_classes=2, + **kwargs, ) ``` -用于获取Module的上下文信息,得到输入、输出以及预训练的Paddle Program副本 -**参数** - -> trainable:设置为True时,Module中的参数在Fine-tune时也会随之训练,否则保持不变。 -> max_seq_len:BERT模型的最大序列长度,若序列长度不足,会通过padding方式补到**max_seq_len**, 若序列长度大于该值,则会以截断方式让序列长度为**max_seq_len**,max_seq_len可取值范围为0~512; - -**返回** -> inputs:dict类型,有以下字段: -> >**input_ids**存放输入文本tokenize后各token对应BERT词汇表的word ids, shape为\[batch_size, max_seq_len\],int64类型; -> >**position_ids**存放输入文本tokenize后各token所在该文本的位置,shape为\[batch_size, max_seq_len\],int64类型; -> >**segment_ids**存放各token所在文本的标识(token属于文本1或者文本2),shape为\[batch_size, max_seq_len\],int64类型; -> >**input_mask**存放token是否为padding的标识,shape为\[batch_size, max_seq_len\],int64类型; -> -> outputs:dict类型,Module的输出特征,有以下字段: -> >**pooled_output**字段存放句子粒度的特征,可用于文本分类等任务,shape为 \[batch_size, 768\],int64类型; -> >**sequence_output**字段存放字粒度的特征,可用于序列标注等任务,shape为 \[batch_size, seq_len, 768\],int64类型; -> -> program:包含该Module计算图的Program。 +创建Module对象(动态图组网版本)。 +**参数** +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 +* `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 +* `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python -def get_embedding( - texts, - use_gpu=False, - batch_size=1 +def predict( + data, + max_seq_len=128, + batch_size=1, + use_gpu=False ) ``` -用于获取输入文本的句子粒度特征与字粒度特征 - **参数** -> texts:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 -> use_gpu:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 +* `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 +* `max_seq_len`:模型处理文本的最大长度 +* `batch_size`:模型批处理大小 +* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** -> results:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。 -> - ```python -def get_params_layer() +def get_embedding( + data, + max_seq_len=128, + batch_size=1, + use_gpu=False +) ``` -用于获取参数层信息,该方法与ULMFiTStrategy联用可以严格按照层数设置分层学习率与逐层解冻。 +用于获取输入文本的句子粒度特征与字粒度特征 **参数** -> 无 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 +* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** -> params_layer:dict类型,key为参数名,值为参数所在层数 +* `results`:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。 + **代码示例** ```python import paddlehub as hub -# Load $ hub install chinese-bert-wwm-ext pretrained model -module = hub.Module(name="chinese-bert-wwm-ext") -inputs, outputs, program = module.context(trainable=True, max_seq_len=128) +data = [ + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], +] +label_map = {0: 'negative', 1: 'positive'} + +model = hub.Module( + name='chinese-bert-wwm-ext', + version='2.0.1', + task='seq-cls', + 
load_checkpoint='/path/to/parameters',
+    label_map=label_map)
+results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
+for idx, text in enumerate(data):
+    print('Data: {} \t Label: {}'.format(text, results[idx]))
+```
 
-# Must feed all the tensor of chinese-bert-wwm-ext's module need
-input_ids = inputs["input_ids"]
-position_ids = inputs["position_ids"]
-segment_ids = inputs["segment_ids"]
-input_mask = inputs["input_mask"]
+详情可参考PaddleHub示例:
+- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification)
+- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling)
 
-# Use "pooled_output" for sentence-level output.
-pooled_output = outputs["pooled_output"]
+## 服务部署
 
-# Use "sequence_output" for token-level output.
-sequence_output = outputs["sequence_output"]
+PaddleHub Serving可以部署一个在线获取预训练词向量的服务。
 
-# Use "get_embedding" to get embedding result.
-embedding_result = module.get_embedding(texts=[["Sample1_text_a"],["Sample2_text_a","Sample2_text_b"]], use_gpu=True)
+### Step1: 启动PaddleHub Serving
 
-# Use "get_params_layer" to get params layer and used to ULMFiTStrategy.
-params_layer = module.get_params_layer()
-strategy = hub.finetune.strategy.ULMFiTStrategy(frz_params_layer=params_layer, dis_params_layer=params_layer)
+运行启动命令:
+
+```shell
+$ hub serving start -m chinese-bert-wwm-ext
+```
+
+这样就完成了一个获取预训练词向量服务化API的部署,默认端口号为8866。
+
+**NOTE:** 如使用GPU预测,则需要在启动服务之前设置CUDA_VISIBLE_DEVICES环境变量,否则无需设置。
+
+### Step2: 发送预测请求
+
+配置好服务端后,以下几行代码即可发送预测请求,获取预测结果
+
+```python
+import requests
+import json
+
+# 指定用于获取embedding的文本,格式为[[text_1], [text_2], ... ]
+text = [["今天是个好日子"], ["天气预报说今天要下雨"]]
+# 以key的方式指定text传入预测方法时的参数,此例中为"data"
+# 对应本地部署,则为module.get_embedding(data=text)
+data = {"data": text}
+# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip
+url = "http://10.12.121.132:8866/predict/chinese-bert-wwm-ext"
+# 指定post请求的headers为application/json方式
+headers = {"Content-Type": "application/json"}
+
+r = requests.post(url=url, headers=headers, data=json.dumps(data))
+print(r.json())
 ```
 
 ## 查看代码
@@ -109,12 +145,16 @@ https://github.com/ymcui/Chinese-BERT-wwm
 
 ## 依赖
 
-paddlepaddle >= 1.6.2
+paddlepaddle >= 2.0.0
 
-paddlehub >= 1.6.0
+paddlehub >= 2.0.0
 
 ## 更新历史
 
 * 1.0.0
 
   初始发布
+
+* 2.0.1
+
+  全面升级动态图,接口有所变化。任务名称调整,增加序列标注任务`token-cls`
diff --git a/modules/text/language_model/chinese_bert_wwm_ext/model/__init__.py b/modules/text/language_model/chinese_bert_wwm_ext/model/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/modules/text/language_model/chinese_bert_wwm_ext/model/bert.py b/modules/text/language_model/chinese_bert_wwm_ext/model/bert.py
deleted file mode 100644
index cf2a32c1..00000000
--- a/modules/text/language_model/chinese_bert_wwm_ext/model/bert.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT model.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import six -import json - -import paddle.fluid as fluid - -from chinese_bert_wwm_ext.model.transformer_encoder import encoder, pre_process_layer - - -class BertConfig(object): - def __init__(self, config_path): - self._config_dict = self._parse(config_path) - - def _parse(self, config_path): - try: - with open(config_path) as json_file: - config_dict = json.load(json_file) - except Exception: - raise IOError("Error in parsing bert model config file '%s'" % config_path) - else: - return config_dict - - def __getitem__(self, key): - return self._config_dict[key] - - def print_config(self): - for arg, value in sorted(six.iteritems(self._config_dict)): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -class BertModel(object): - def __init__(self, src_ids, position_ids, sentence_ids, input_mask, config, weight_sharing=True, use_fp16=False): - - self._emb_size = config['hidden_size'] - self._n_layer = config['num_hidden_layers'] - self._n_head = config['num_attention_heads'] - self._voc_size = config['vocab_size'] - self._max_position_seq_len = config['max_position_embeddings'] - self._sent_types = config['type_vocab_size'] - self._hidden_act = config['hidden_act'] - self._prepostprocess_dropout = config['hidden_dropout_prob'] - self._attention_dropout = config['attention_probs_dropout_prob'] - self._weight_sharing = weight_sharing - - self._word_emb_name = "word_embedding" - self._pos_emb_name = "pos_embedding" - self._sent_emb_name = "sent_embedding" - self._dtype = "float16" if use_fp16 else "float32" - - # Initialize all weigths by truncated normal initializer, and all biases - # will be initialized by constant zero by default. 
- self._param_initializer = fluid.initializer.TruncatedNormal(scale=config['initializer_range']) - - self._build_model(src_ids, position_ids, sentence_ids, input_mask) - - def _build_model(self, src_ids, position_ids, sentence_ids, input_mask): - # padding id in vocabulary must be set to 0 - emb_out = fluid.layers.embedding(input=src_ids, - size=[self._voc_size, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._word_emb_name, - initializer=self._param_initializer), - is_sparse=False) - position_emb_out = fluid.layers.embedding(input=position_ids, - size=[self._max_position_seq_len, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._pos_emb_name, - initializer=self._param_initializer)) - - sent_emb_out = fluid.layers.embedding(sentence_ids, - size=[self._sent_types, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._sent_emb_name, - initializer=self._param_initializer)) - - emb_out = emb_out + position_emb_out - emb_out = emb_out + sent_emb_out - - emb_out = pre_process_layer(emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder') - - if self._dtype == "float16": - input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype) - - self_attn_mask = fluid.layers.matmul(x=input_mask, y=input_mask, transpose_y=True) - self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) - n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1) - n_head_self_attn_mask.stop_gradient = True - - self._enc_out = encoder(enc_input=emb_out, - attn_bias=n_head_self_attn_mask, - n_layer=self._n_layer, - n_head=self._n_head, - d_key=self._emb_size // self._n_head, - d_value=self._emb_size // self._n_head, - d_model=self._emb_size, - d_inner_hid=self._emb_size * 4, - prepostprocess_dropout=self._prepostprocess_dropout, - attention_dropout=self._attention_dropout, - relu_dropout=0, - hidden_act=self._hidden_act, - preprocess_cmd="", - postprocess_cmd="dan", - param_initializer=self._param_initializer, - name='encoder') - - def get_sequence_output(self): - return self._enc_out - - def get_pooled_output(self): - """Get the first feature of each sequence for classification""" - - next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1]) - next_sent_feat = fluid.layers.fc(input=next_sent_feat, - size=self._emb_size, - act="tanh", - param_attr=fluid.ParamAttr(name="pooled_fc.w_0", - initializer=self._param_initializer), - bias_attr="pooled_fc.b_0") - return next_sent_feat - - def get_pretraining_output(self, mask_label, mask_pos, labels): - """Get the loss & accuracy for pretraining""" - - mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') - - # extract the first token feature in each sentence - next_sent_feat = self.get_pooled_output() - reshaped_emb_out = fluid.layers.reshape(x=self._enc_out, shape=[-1, self._emb_size]) - # extract masked tokens' feature - mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) - - # transform: fc - mask_trans_feat = fluid.layers.fc(input=mask_feat, - size=self._emb_size, - act=self._hidden_act, - param_attr=fluid.ParamAttr(name='mask_lm_trans_fc.w_0', - initializer=self._param_initializer), - bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0')) - # transform: layer norm - mask_trans_feat = pre_process_layer(mask_trans_feat, 'n', name='mask_lm_trans') - - mask_lm_out_bias_attr = fluid.ParamAttr(name="mask_lm_out_fc.b_0", - 
initializer=fluid.initializer.Constant(value=0.0)) - if self._weight_sharing: - fc_out = fluid.layers.matmul(x=mask_trans_feat, - y=fluid.default_main_program().global_block().var(self._word_emb_name), - transpose_y=True) - fc_out += fluid.layers.create_parameter(shape=[self._voc_size], - dtype=self._dtype, - attr=mask_lm_out_bias_attr, - is_bias=True) - - else: - fc_out = fluid.layers.fc(input=mask_trans_feat, - size=self._voc_size, - param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0", - initializer=self._param_initializer), - bias_attr=mask_lm_out_bias_attr) - - mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label) - mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss) - - next_sent_fc_out = fluid.layers.fc(input=next_sent_feat, - size=2, - param_attr=fluid.ParamAttr(name="next_sent_fc.w_0", - initializer=self._param_initializer), - bias_attr="next_sent_fc.b_0") - - next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(logits=next_sent_fc_out, - label=labels, - return_softmax=True) - - next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax, label=labels) - - mean_next_sent_loss = fluid.layers.mean(next_sent_loss) - - loss = mean_next_sent_loss + mean_mask_lm_loss - return next_sent_acc, mean_mask_lm_loss, loss diff --git a/modules/text/language_model/chinese_bert_wwm_ext/model/transformer_encoder.py b/modules/text/language_model/chinese_bert_wwm_ext/model/transformer_encoder.py deleted file mode 100644 index b15d8388..00000000 --- a/modules/text/language_model/chinese_bert_wwm_ext/model/transformer_encoder.py +++ /dev/null @@ -1,295 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Transformer encoder.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from functools import partial - -import paddle.fluid as fluid -import paddle.fluid.layers as layers - - -def multi_head_attention(queries, - keys, - values, - attn_bias, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0., - cache=None, - param_initializer=None, - name='multi_head_att'): - """ - Multi-Head Attention. Note that attn_bias is added to the logit before - computing softmax activiation to mask certain selected positions so that - they will not considered in attention weights. - """ - keys = queries if keys is None else keys - values = keys if values is None else values - - if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): - raise ValueError("Inputs: quries, keys and values should all be 3-D tensors.") - - def __compute_qkv(queries, keys, values, n_head, d_key, d_value): - """ - Add linear projection to queries, keys, and values. 
- """ - q = layers.fc(input=queries, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_query_fc.w_0', initializer=param_initializer), - bias_attr=name + '_query_fc.b_0') - k = layers.fc(input=keys, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_key_fc.w_0', initializer=param_initializer), - bias_attr=name + '_key_fc.b_0') - v = layers.fc(input=values, - size=d_value * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_value_fc.w_0', initializer=param_initializer), - bias_attr=name + '_value_fc.b_0') - return q, k, v - - def __split_heads(x, n_head): - """ - Reshape the last dimension of inpunt tensor x so that it becomes two - dimensions and then transpose. Specifically, input a tensor with shape - [bs, max_sequence_length, n_head * hidden_dim] then output a tensor - with shape [bs, n_head, max_sequence_length, hidden_dim]. - """ - hidden_size = x.shape[-1] - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) - - # permuate the dimensions into: - # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) - - def __combine_heads(x): - """ - Transpose and then reshape the last two dimensions of inpunt tensor x - so that it becomes one dimension, which is reverse to __split_heads. - """ - if len(x.shape) == 3: return x - if len(x.shape) != 4: - raise ValueError("Input(x) should be a 4-D Tensor.") - - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - return layers.reshape(x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True) - - def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): - """ - Scaled Dot-Product Attention - """ - scaled_q = layers.scale(x=q, scale=d_key**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - if attn_bias: - product += attn_bias - weights = layers.softmax(product) - if dropout_rate: - weights = layers.dropout(weights, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.matmul(weights, v) - return out - - q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - - if cache is not None: # use cache and concat time steps - # Since the inplace reshape in __split_heads changes the shape of k and - # v, which is the cache input for next time step, reshape the cache - # input from the previous time step first. - k = cache["k"] = layers.concat([layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1) - v = cache["v"] = layers.concat([layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1) - - q = __split_heads(q, n_head) - k = __split_heads(k, n_head) - v = __split_heads(v, n_head) - - ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate) - - out = __combine_heads(ctx_multiheads) - - # Project back to the model size. 
- proj_out = layers.fc(input=out, - size=d_model, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_output_fc.w_0', initializer=param_initializer), - bias_attr=name + '_output_fc.b_0') - return proj_out - - -def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate, hidden_act, param_initializer=None, name='ffn'): - """ - Position-wise Feed-Forward Networks. - This module consists of two linear transformations with a ReLU activation - in between, which is applied to each position separately and identically. - """ - hidden = layers.fc(input=x, - size=d_inner_hid, - num_flatten_dims=2, - act=hidden_act, - param_attr=fluid.ParamAttr(name=name + '_fc_0.w_0', initializer=param_initializer), - bias_attr=name + '_fc_0.b_0') - if dropout_rate: - hidden = layers.dropout(hidden, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.fc(input=hidden, - size=d_hid, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_fc_1.w_0', initializer=param_initializer), - bias_attr=name + '_fc_1.b_0') - return out - - -def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., name=''): - """ - Add residual connection, layer normalization and droput to the out tensor - optionally according to the value of process_cmd. - This will be used before or after multi-head attention and position-wise - feed-forward networks. - """ - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out_dtype = out.dtype - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float32") - out = layers.layer_norm(out, - begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.ParamAttr(name=name + '_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr(name=name + '_layer_norm_bias', - initializer=fluid.initializer.Constant(0.))) - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float16") - elif cmd == "d": # add dropout - if dropout_rate: - out = layers.dropout(out, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - return out - - -pre_process_layer = partial(pre_post_process_layer, None) -post_process_layer = pre_post_process_layer - - -def encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """The encoder layers that can be stacked to form a deep encoder. - This module consits of a multi-head (self) attention followed by - position-wise feed-forward networks and both the two components companied - with the post_process_layer to add residual connection, layer normalization - and droput. 
- """ - attn_output = multi_head_attention(pre_process_layer(enc_input, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_att'), - None, - None, - attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - param_initializer=param_initializer, - name=name + '_multi_head_att') - attn_output = post_process_layer(enc_input, - attn_output, - postprocess_cmd, - prepostprocess_dropout, - name=name + '_post_att') - ffd_output = positionwise_feed_forward(pre_process_layer(attn_output, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_ffn'), - d_inner_hid, - d_model, - relu_dropout, - hidden_act, - param_initializer=param_initializer, - name=name + '_ffn') - return post_process_layer(attn_output, ffd_output, postprocess_cmd, prepostprocess_dropout, name=name + '_post_ffn') - - -def encoder(enc_input, - attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """ - The encoder is composed of a stack of identical layers returned by calling - encoder_layer. - """ - for i in range(n_layer): - enc_output = encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - param_initializer=param_initializer, - name=name + '_layer_' + str(i)) - enc_input = enc_output - enc_output = pre_process_layer(enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder") - - return enc_output diff --git a/modules/text/language_model/chinese_bert_wwm_ext/module.py b/modules/text/language_model/chinese_bert_wwm_ext/module.py index 273b2f02..6ff6803f 100644 --- a/modules/text/language_model/chinese_bert_wwm_ext/module.py +++ b/modules/text/language_model/chinese_bert_wwm_ext/module.py @@ -1,7 +1,6 @@ -# coding:utf-8 -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the "License" +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # @@ -12,62 +11,121 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
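# The token-cls branch of the rewritten module below scores predictions with
# paddlenlp's ChunkEvaluator, which counts whole entity chunks rather than
# single tags. A standalone sketch mirroring the call pattern used in
# forward(): the leading None fills an unused `inputs` slot in this paddlenlp
# version, and the tag set is illustrative.
import paddle
from paddlenlp.metrics import ChunkEvaluator

metric = ChunkEvaluator(label_list=['B-LOC', 'I-LOC', 'O'])

lengths = paddle.to_tensor([3])         # valid tokens per sequence
preds = paddle.to_tensor([[0, 1, 2]])   # predicted tag ids
labels = paddle.to_tensor([[0, 1, 2]])  # gold tag ids

num_infer, num_label, num_correct = metric.compute(None, lengths, preds, labels)
metric.update(num_infer.numpy(), num_label.numpy(), num_correct.numpy())
precision, recall, f1 = metric.accumulate()
print(precision, recall, f1)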
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - +from typing import Dict import os +import math -from paddlehub import TransformerModule -from paddlehub.module.module import moduleinfo +import paddle +import paddle.nn as nn +import paddle.nn.functional as F -from chinese_bert_wwm_ext.model.bert import BertConfig, BertModel +from paddlenlp.transformers.bert.modeling import BertForSequenceClassification, BertModel, BertForTokenClassification +from paddlenlp.transformers.bert.tokenizer import BertTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule +from paddlehub.utils.log import logger @moduleinfo( name="chinese-bert-wwm-ext", - version="1.0.0", - summary="chinese-bert-wwm-ext, 12-layer, 768-hidden, 12-heads, 110M parameters ", + version="2.0.1", + summary= + "chinese-bert-wwm-ext, 12-layer, 768-hidden, 12-heads, 110M parameters. The module is executed as paddle.dygraph.", author="ymcui", author_email="ymcui@ir.hit.edu.cn", type="nlp/semantic_model", + meta=TransformerModule ) -class BertWwm(TransformerModule): - def _initialize(self): - self.MAX_SEQ_LEN = 512 - self.params_path = os.path.join(self.directory, "assets", "params") - self.vocab_path = os.path.join(self.directory, "assets", "vocab.txt") +class BertWwm(nn.Layer): + """ + BertWwm model + """ - bert_config_path = os.path.join(self.directory, "assets", "bert_config.json") - self.bert_config = BertConfig(bert_config_path) + def __init__( + self, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, + ): + super(BertWwm, self).__init__() + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes - def net(self, input_ids, position_ids, segment_ids, input_mask): - """ - create neural network. + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': + self.model = BertForSequenceClassification.from_pretrained( + pretrained_model_name_or_path='bert-wwm-ext-chinese', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = BertForTokenClassification.from_pretrained( + pretrained_model_name_or_path='bert-wwm-ext-chinese', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) + elif task is None: + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-wwm-ext-chinese', **kwargs) + else: + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) - Args: - input_ids (tensor): the word ids. - position_ids (tensor): the position ids. - segment_ids (tensor): the segment ids. - input_mask (tensor): the padding mask. + self.task = task - Returns: - pooled_output (tensor): sentence-level output for classification task. - sequence_output (tensor): token-level output for sequence task. 
- """ - bert = BertModel(src_ids=input_ids, - position_ids=position_ids, - sentence_ids=segment_ids, - input_mask=input_mask, - config=self.bert_config, - use_fp16=False) - pooled_output = bert.get_pooled_output() - sequence_output = bert.get_sequence_output() - return pooled_output, sequence_output + if load_checkpoint is not None and os.path.isfile(load_checkpoint): + state_dict = paddle.load(load_checkpoint) + self.set_state_dict(state_dict) + logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + if self.task == 'seq-cls': + logits = result + probs = F.softmax(logits, axis=1) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs + else: + sequence_output, pooled_output = result + return sequence_output, pooled_output -if __name__ == '__main__': - test_module = BertWwm() + @staticmethod + def get_tokenizer(*args, **kwargs): + """ + Gets the tokenizer that is customized for this module. + """ + return BertTokenizer.from_pretrained( + pretrained_model_name_or_path='bert-wwm-ext-chinese', *args, **kwargs) diff --git a/modules/text/language_model/ernie/README.md b/modules/text/language_model/ernie/README.md index 4aebcebf..4ee91755 100644 --- a/modules/text/language_model/ernie/README.md +++ b/modules/text/language_model/ernie/README.md @@ -1,5 +1,5 @@ ```shell -$ hub install ernie==2.0.0 +$ hub install ernie==2.0.1 ``` ## 在线体验 AI Studio 快速体验 @@ -15,7 +15,6 @@ $ hub install ernie==2.0.0

- 更多详情请参考[ERNIE论文](https://arxiv.org/abs/1904.09223) ## API @@ -24,23 +23,29 @@ $ hub install ernie==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** @@ -55,7 +60,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -64,7 +71,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -78,16 +87,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='ernie', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -95,7 +104,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -121,12 +132,12 @@ $ hub serving start -m ernie import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... ]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/ernie" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -170,3 +181,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图版本,接口有所变化 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/ernie/module.py b/modules/text/language_model/ernie/module.py index 752a33a4..37a99500 100644 --- a/modules/text/language_model/ernie/module.py +++ b/modules/text/language_model/ernie/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_ernie import ErnieModel, ErnieForSequenceClassification -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.ernie.modeling import ErnieModel, ErnieForSequenceClassification, ErnieForTokenClassification +from paddlenlp.transformers.ernie.tokenizer import ErnieTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="ernie", - version="2.0.0", + version="2.0.1", summary= "Baidu's ERNIE, Enhanced Representation through kNowledge IntEgration, max_seq_len=512 when predtrained. The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class Ernie(nn.Layer): """ Ernie model @@ -41,180 +43,80 @@ class Ernie(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Ernie, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': - self.model = ErnieForSequenceClassification.from_pretrained(pretrained_model_name_or_path='ernie') + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': + self.model = ErnieForSequenceClassification.from_pretrained(pretrained_model_name_or_path='ernie-1.0', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = ErnieForTokenClassification.from_pretrained(pretrained_model_name_or_path='ernie-1.0', num_classes=self.num_classes, **kwargs) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie') + self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-1.0', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, 
token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'ernie', 'vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie/vocab.txt" - download(url, os.path.join(DATA_HOME, 'ernie')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. 
- batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return ErnieTokenizer.from_pretrained( + pretrained_model_name_or_path='ernie-1.0', *args, **kwargs) diff --git a/modules/text/language_model/ernie_tiny/README.md b/modules/text/language_model/ernie_tiny/README.md index 899c500e..f02419a6 100644 --- a/modules/text/language_model/ernie_tiny/README.md +++ b/modules/text/language_model/ernie_tiny/README.md @@ -23,7 +23,10 @@ $ hub install ernie_tiny==2.0.1 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 @@ -33,13 +36,16 @@ def __init__( * `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** @@ -54,7 +60,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -63,7 +71,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -94,7 +104,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -120,12 +132,12 @@ $ hub serving start -m ernie_tiny import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... ]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/ernie_tiny" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} diff --git a/modules/text/language_model/ernie_tiny/module.py b/modules/text/language_model/ernie_tiny/module.py index 77b954c8..d309ac47 100644 --- a/modules/text/language_model/ernie_tiny/module.py +++ b/modules/text/language_model/ernie_tiny/module.py @@ -11,19 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import ErnieTinyTokenizer from paddlenlp.transformers.ernie.modeling import ErnieModel, ErnieForSequenceClassification, ErnieForTokenClassification +from paddlenlp.transformers.ernie.tokenizer import ErnieTinyTokenizer +from paddlenlp.metrics import ChunkEvaluator from paddlehub.module.module import moduleinfo from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( @@ -41,14 +42,15 @@ class ErnieTiny(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, - num_classes=2, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, **kwargs, ): super(ErnieTiny, self).__init__() if label_map: + self.label_map = label_map self.num_classes = len(label_map) else: self.num_classes = num_classes @@ -57,7 +59,7 @@ class ErnieTiny(nn.Layer): task = 'seq-cls' logger.warning( "current task name 'sequence_classification' was renamed to 'seq-cls', " - "'sequence_classification' has been deprecated and will be removed the future.", + "'sequence_classification' has been deprecated and will be removed in the future.", ) if task == 'seq-cls': self.model = ErnieForSequenceClassification.from_pretrained(pretrained_model_name_or_path='ernie-tiny', num_classes=self.num_classes, **kwargs) @@ -66,7 +68,9 @@ class ErnieTiny(nn.Layer): elif task == 'token-cls': self.model = ErnieForTokenClassification.from_pretrained(pretrained_model_name_or_path='ernie-tiny', num_classes=self.num_classes, **kwargs) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-tiny', **kwargs) else: @@ -74,14 +78,13 @@ class ErnieTiny(nn.Layer): task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) if self.task == 'seq-cls': logits = result @@ -90,49 +93,29 @@ class ErnieTiny(nn.Layer): loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs elif self.task == 'token-cls': logits = result - token_level_probs = F.softmax(logits, axis=2) + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) if labels is not None: - labels = paddle.to_tensor(labels).unsqueeze(-1) - loss = self.criterion(logits, labels) - correct = self.metric.compute(token_level_probs, labels) - acc = self.metric.update(correct) - return token_level_probs, loss, acc + loss = 
self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs else: sequence_output, pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'ernie_tiny', 'vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny/vocab.txt" - download(url, os.path.join(DATA_HOME, 'ernie_tiny')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. """ - spm_path = os.path.join(DATA_HOME, 'ernie_tiny', 'spm_cased_simp_sampled.model') - if not os.path.exists(spm_path) or not os.path.isfile(spm_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny/spm_cased_simp_sampled.model" - download(url, os.path.join(DATA_HOME, 'ernie_tiny')) - - word_dict_path = os.path.join(DATA_HOME, 'ernie_tiny', 'dict.wordseg.pickle') - if not os.path.exists(word_dict_path) or not os.path.isfile(word_dict_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny/dict.wordseg.pickle" - download(url, os.path.join(DATA_HOME, 'ernie_tiny')) - - return ErnieTinyTokenizer(self.get_vocab_path(), spm_path, word_dict_path) + return ErnieTinyTokenizer.from_pretrained( + pretrained_model_name_or_path='ernie-tiny', *args, **kwargs) diff --git a/modules/text/language_model/ernie_v2_eng_base/README.md b/modules/text/language_model/ernie_v2_eng_base/README.md index cd203a4a..d5ece7a9 100644 --- a/modules/text/language_model/ernie_v2_eng_base/README.md +++ b/modules/text/language_model/ernie_v2_eng_base/README.md @@ -1,6 +1,6 @@ ```shell -$ hub install ernie_v2_eng_base==2.0.0 +$ hub install ernie_v2_eng_base==2.0.1 ```

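+安装完成后,可按如下方式快速创建一个序列标注(token-cls)Module。以下仅为示意代码,其中`label_map`中的BIO风格标签为假设的示例,实际取值需与训练数据的标签体系保持一致:
+
+```python
+import paddlehub as hub
+
+# 序列标注任务的类别映射表(BIO风格标签,仅为示例)
+label_map = {0: 'B-PER', 1: 'I-PER', 2: 'O'}
+
+model = hub.Module(
+    name='ernie_v2_eng_base',
+    version='2.0.1',
+    task='token-cls',
+    label_map=label_map)
+```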
@@ -19,23 +19,29 @@ $ hub install ernie_v2_eng_base==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** @@ -50,7 +56,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -59,7 +67,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -73,16 +83,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='ernie_v2_eng_base', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -90,7 +100,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -116,12 +128,12 @@ $ hub serving start -m ernie_v2_eng_base import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... 
]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/ernie_v2_eng_base" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -157,3 +169,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图版本,接口有所变化 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/ernie_v2_eng_base/module.py b/modules/text/language_model/ernie_v2_eng_base/module.py index b9b74ba3..59ea31b7 100644 --- a/modules/text/language_model/ernie_v2_eng_base/module.py +++ b/modules/text/language_model/ernie_v2_eng_base/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_ernie import ErnieModel, ErnieForSequenceClassification -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.ernie.modeling import ErnieModel, ErnieForSequenceClassification, ErnieForTokenClassification +from paddlenlp.transformers.ernie.tokenizer import ErnieTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="ernie_v2_eng_base", - version="2.0.0", + version="2.0.1", summary= "Baidu's ERNIE 2.0, Enhanced Representation through kNowledge IntEgration, max_seq_len=512 when predtrained. 
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class ErnieV2(nn.Layer): """ Ernie model @@ -41,181 +43,88 @@ class ErnieV2(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(ErnieV2, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': self.model = ErnieForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='ernie_v2_eng_base') + pretrained_model_name_or_path='ernie-2.0-en', + num_classes=self.num_classes, + **kwargs + ) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = ErnieForTokenClassification.from_pretrained( + pretrained_model_name_or_path='ernie-2.0-en', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie_v2_eng_base') + self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-2.0-en', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: sequence_output, pooled_output = 
result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'ernie', 'vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_v2_eng_base/vocab.txt" - download(url, os.path.join(DATA_HOME, 'ernie')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return ErnieTokenizer.from_pretrained( + pretrained_model_name_or_path='ernie-2.0-en', *args, **kwargs) \ No newline at end of file diff --git a/modules/text/language_model/ernie_v2_eng_large/README.md b/modules/text/language_model/ernie_v2_eng_large/README.md index 149d6012..680bc1be 100644 --- a/modules/text/language_model/ernie_v2_eng_large/README.md +++ b/modules/text/language_model/ernie_v2_eng_large/README.md @@ -1,6 +1,6 @@ ```shell -$ hub install ernie_v2_eng_large==2.0.0 +$ hub install ernie_v2_eng_large==2.0.1 ```

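+若只需提取文本特征而无需Fine-tune,可不指定`task`,直接调用`get_embedding`。以下为示意代码,输入文本仅为示例:
+
+```python
+import paddlehub as hub
+
+# task为None时返回原始模型,可用于提取句子粒度与字粒度特征
+model = hub.Module(name='ernie_v2_eng_large', version='2.0.1', task=None)
+
+data = [['今天是个好日子'], ['天气预报说今天要下雨']]
+# 返回格式为[[pooled_feature, seq_feature], ...]
+embeddings = model.get_embedding(data, max_seq_len=128, batch_size=1, use_gpu=False)
+```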
@@ -19,29 +19,35 @@ $ hub install ernie_v2_eng_large==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -50,7 +56,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -59,7 +67,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -73,16 +83,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='ernie_v2_eng_large', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -90,7 +100,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -116,12 +128,12 @@ $ hub serving start -m ernie_v2_eng_large import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... 
]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/ernie_v2_eng_large" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -157,3 +169,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图版本,接口有所变化 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/ernie_v2_eng_large/module.py b/modules/text/language_model/ernie_v2_eng_large/module.py index 8d3ae55f..0d54a670 100644 --- a/modules/text/language_model/ernie_v2_eng_large/module.py +++ b/modules/text/language_model/ernie_v2_eng_large/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_ernie import ErnieModel, ErnieForSequenceClassification -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.ernie.modeling import ErnieModel, ErnieForSequenceClassification, ErnieForTokenClassification +from paddlenlp.transformers.ernie.tokenizer import ErnieTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="ernie_v2_eng_large", - version="2.0.0", + version="2.0.1", summary= "Baidu's ERNIE 2.0, Enhanced Representation through kNowledge IntEgration, max_seq_len=512 when predtrained. 
The module is executed as paddle.dygraph.", author="paddlepaddle", author_email="", - type="nlp/semantic_model") + type="nlp/semantic_model", + meta=TransformerModule) class ErnieV2(nn.Layer): """ Ernie model @@ -41,181 +43,88 @@ class ErnieV2(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(ErnieV2, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': self.model = ErnieForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='ernie_v2_eng_large') + pretrained_model_name_or_path='ernie-2.0-large-en', + num_classes=self.num_classes, + **kwargs + ) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = ErnieForTokenClassification.from_pretrained( + pretrained_model_name_or_path='ernie-2.0-large-en', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie_v2_eng_large') + self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-2.0-large-en', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: sequence_output, 
pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'ernie', 'vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_v2_eng_large/vocab.txt" - download(url, os.path.join(DATA_HOME, 'ernie')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text))
-            examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids']))
-
-        def _batchify_fn(batch):
-            input_ids = [entry[0] for entry in batch]
-            segment_ids = [entry[1] for entry in batch]
-            return input_ids, segment_ids
-
-        # Seperates data into some batches.
-        batches = []
-        one_batch = []
-        for example in examples:
-            one_batch.append(example)
-            if len(one_batch) == batch_size:
-                batches.append(one_batch)
-                one_batch = []
-        if one_batch:
-            # The last batch whose size is less than the config batch_size setting.
-            batches.append(one_batch)
-
-        results = []
-        self.eval()
-        for batch in batches:
-            input_ids, segment_ids = _batchify_fn(batch)
-            input_ids = paddle.to_tensor(input_ids)
-            segment_ids = paddle.to_tensor(segment_ids)
-
-            # TODO(zhangxuefei): add task token_classification postprocess after prediction.
-            if self.task == 'sequence_classification':
-                probs = self(input_ids, segment_ids)
-                idx = paddle.argmax(probs, axis=1).numpy()
-                idx = idx.tolist()
-                labels = [self.label_map[i] for i in idx]
-                results.extend(labels)
-
-        return results
-
-    @serving
-    def get_embedding(self, texts, use_gpu=False):
-        if self.task is not None:
-            raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task)
-
-        paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu')
-
-        tokenizer = self.get_tokenizer()
-        results = []
-        for text in texts:
-            if len(text) == 1:
-                encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False)
-            elif len(text) == 2:
-                encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False)
-            else:
-                raise RuntimeError(
-                    'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text))
-
-            input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0)
-            segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0)
-            sequence_output, pooled_output = self(input_ids, segment_ids)
-
-            sequence_output = sequence_output.squeeze(0)
-            pooled_output = pooled_output.squeeze(0)
-            results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist()))
-        return results
+        return ErnieTokenizer.from_pretrained(
+            pretrained_model_name_or_path='ernie-2.0-large-en', *args, **kwargs)
diff --git a/modules/text/language_model/rbt3/README.md b/modules/text/language_model/rbt3/README.md
index a9a001d8..baf02dd1 100644
--- a/modules/text/language_model/rbt3/README.md
+++ b/modules/text/language_model/rbt3/README.md
@@ -1,5 +1,5 @@
 ```shell
-$ hub install rbt3==1.0.0
+$ hub install rbt3==2.0.1
 ```


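+除文本分类外,rbt3同样支持序列标注任务(token-cls)。以下为创建序列标注Module的示意代码,`label_map`以MSRA-NER风格的BIO标签为例,仅作参考:
+
+```python
+import paddlehub as hub
+
+# 序列标注任务的标签映射表(BIO标签,仅为示例)
+label_map = {
+    0: 'B-PER', 1: 'I-PER',
+    2: 'B-ORG', 3: 'I-ORG',
+    4: 'B-LOC', 5: 'I-LOC',
+    6: 'O',
+}
+
+model = hub.Module(
+    name='rbt3',
+    version='2.0.1',
+    task='token-cls',
+    label_map=label_map)
+```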
@@ -9,94 +9,130 @@ $ hub install rbt3==1.0.0
 
 ## API
 
 ```python
-def context(
-    trainable=True,
-    max_seq_len=128
+def __init__(
+    task=None,
+    load_checkpoint=None,
+    label_map=None,
+    num_classes=2,
+    **kwargs,
 )
 ```
-用于获取Module的上下文信息,得到输入、输出以及预训练的Paddle Program副本
-**参数**
-
-> trainable:设置为True时,Module中的参数在Fine-tune时也会随之训练,否则保持不变。
-> max_seq_len:BERT模型的最大序列长度,若序列长度不足,会通过padding方式补到**max_seq_len**, 若序列长度大于该值,则会以截断方式让序列长度为**max_seq_len**,max_seq_len可取值范围为0~512;
-
-**返回**
-> inputs:dict类型,有以下字段:
-> >**input_ids**存放输入文本tokenize后各token对应BERT词汇表的word ids, shape为\[batch_size, max_seq_len\],int64类型;
-> >**position_ids**存放输入文本tokenize后各token所在该文本的位置,shape为\[batch_size, max_seq_len\],int64类型;
-> >**segment_ids**存放各token所在文本的标识(token属于文本1或者文本2),shape为\[batch_size, max_seq_len\],int64类型;
-> >**input_mask**存放token是否为padding的标识,shape为\[batch_size, max_seq_len\],int64类型;
->
-> outputs:dict类型,Module的输出特征,有以下字段:
-> >**pooled_output**字段存放句子粒度的特征,可用于文本分类等任务,shape为 \[batch_size, 768\],int64类型;
-> >**sequence_output**字段存放字粒度的特征,可用于序列标注等任务,shape为 \[batch_size, seq_len, 768\],int64类型;
->
-> program:包含该Module计算图的Program。
+创建Module对象(动态图组网版本)。
+
+**参数**
+
+* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。
+* `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。
+* `label_map`:预测时的类别映射表。
+* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。
+* `**kwargs`:用户额外指定的关键字字典类型的参数。
 
 ```python
-def get_embedding(
-    texts,
-    use_gpu=False,
-    batch_size=1
+def predict(
+    data,
+    max_seq_len=128,
+    batch_size=1,
+    use_gpu=False
 )
 ```
-用于获取输入文本的句子粒度特征与字粒度特征
-
 **参数**
-> texts:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。
-> use_gpu:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
+* `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,
+  每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。
+* `max_seq_len`:模型处理文本的最大长度
+* `batch_size`:模型批处理大小
+* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
 
 **返回**
-> results:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。
->
-
 ```python
-def get_params_layer()
+def get_embedding(
+    data,
+    max_seq_len=128,
+    batch_size=1,
+    use_gpu=False
+)
 ```
-用于获取参数层信息,该方法与ULMFiTStrategy联用可以严格按照层数设置分层学习率与逐层解冻。
+用于获取输入文本的句子粒度特征与字粒度特征
 
 **参数**
-> 无
+* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。
+* `max_seq_len`:模型处理文本的最大长度。
+* `batch_size`:模型批处理大小。
+* `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。
 
 **返回**
-> params_layer:dict类型,key为参数名,值为参数所在层数
+* `results`:list类型,格式为\[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\],其中每个元素都是对应样例的特征输出,每个样例都有句子粒度特征pooled\_feature与字粒度特征seq\_feature。
+
 
 **代码示例**
 
 ```python
 import paddlehub as hub
 
-# Load $ hub install rbt3 pretrained model
-module = hub.Module(name="rbt3")
-inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
+data = [
+    ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'],
+    ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'],
+    ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'],
+]
+label_map = {0: 'negative', 1: 'positive'}
+
+model = hub.Module(
+    name='rbt3',
+    version='2.0.1',
+    task='seq-cls',
+    load_checkpoint='/path/to/parameters',
+    label_map=label_map)
+results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
+for idx, text in enumerate(data):
+    print('Data: {} \t Label: {}'.format(text, results[idx]))
+```

-# Must feed all the tensor of rbt3's module need
-input_ids = inputs["input_ids"]
-position_ids = inputs["position_ids"]
-segment_ids = inputs["segment_ids"]
-input_mask = inputs["input_mask"]
+For details, see the PaddleHub demos:
+- [Text classification](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification)
+- [Sequence labeling](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling)

-# Use "pooled_output" for sentence-level output.
-pooled_output = outputs["pooled_output"]
+## Serving

-# Use "sequence_output" for token-level output.
-sequence_output = outputs["sequence_output"]
+PaddleHub Serving can deploy an online service for extracting pretrained embeddings.

-# Use "get_embedding" to get embedding result.
-embedding_result = module.get_embedding(texts=[["Sample1_text_a"],["Sample2_text_a","Sample2_text_b"]], use_gpu=True)
+### Step1: start PaddleHub Serving

-# Use "get_params_layer" to get params layer and used to ULMFiTStrategy.
-params_layer = module.get_params_layer()
-strategy = hub.finetune.strategy.ULMFiTStrategy(frz_params_layer=params_layer, dis_params_layer=params_layer)
+Run the start command:
+
+```shell
+$ hub serving start -m rbt3
+```
+
+This deploys an online API for extracting pretrained embeddings; the default port is 8866.
+
+**NOTE:** to predict on GPU, set the CUDA_VISIBLE_DEVICES environment variable before starting the service; otherwise no setting is needed.
+
+### Step2: send prediction requests
+
+With the server configured, the following few lines of code send a prediction request and fetch the result:
+
+```python
+import requests
+import json
+
+# Texts for embedding extraction, formatted as [[text_1], [text_2], ...]
+text = [["今天是个好日子"], ["天气预报说今天要下雨"]]
+# Pass the texts to the prediction method via a key, "data" in this example;
+# the local equivalent is module.get_embedding(data=text)
+data = {"data": text}
+# Send a POST request; the content type must be JSON, and the IP in the URL
+# must be changed to that of the serving machine
+url = "http://10.12.121.132:8866/predict/rbt3"
+# Set the request headers to application/json
+headers = {"Content-Type": "application/json"}
+
+r = requests.post(url=url, headers=headers, data=json.dumps(data))
+print(r.json())
 ```

 ## Code
@@ -109,12 +145,16 @@ https://github.com/ymcui/Chinese-BERT-wwm

 ## Dependencies

-paddlepaddle >= 1.6.2
+paddlepaddle >= 2.0.0

-paddlehub >= 1.6.0
+paddlehub >= 2.0.0

 ## Release history

 * 1.0.0

   Initial release
+
+* 2.0.1
+
+  Fully upgraded to dygraph, with API changes; task names adjusted and the sequence labeling task `token-cls` added
diff --git a/modules/text/language_model/rbt3/model/__init__.py b/modules/text/language_model/rbt3/model/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/modules/text/language_model/rbt3/model/bert.py b/modules/text/language_model/rbt3/model/bert.py
deleted file mode 100644
index 4d37cb02..00000000
--- a/modules/text/language_model/rbt3/model/bert.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT model.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import six -import json - -import paddle.fluid as fluid - -from rbt3.model.transformer_encoder import encoder, pre_process_layer - - -class BertConfig(object): - def __init__(self, config_path): - self._config_dict = self._parse(config_path) - - def _parse(self, config_path): - try: - with open(config_path) as json_file: - config_dict = json.load(json_file) - except Exception: - raise IOError("Error in parsing bert model config file '%s'" % config_path) - else: - return config_dict - - def __getitem__(self, key): - return self._config_dict[key] - - def print_config(self): - for arg, value in sorted(six.iteritems(self._config_dict)): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -class BertModel(object): - def __init__(self, src_ids, position_ids, sentence_ids, input_mask, config, weight_sharing=True, use_fp16=False): - - self._emb_size = config['hidden_size'] - self._n_layer = config['num_hidden_layers'] - self._n_head = config['num_attention_heads'] - self._voc_size = config['vocab_size'] - self._max_position_seq_len = config['max_position_embeddings'] - self._sent_types = config['type_vocab_size'] - self._hidden_act = config['hidden_act'] - self._prepostprocess_dropout = config['hidden_dropout_prob'] - self._attention_dropout = config['attention_probs_dropout_prob'] - self._weight_sharing = weight_sharing - - self._word_emb_name = "word_embedding" - self._pos_emb_name = "pos_embedding" - self._sent_emb_name = "sent_embedding" - self._dtype = "float16" if use_fp16 else "float32" - - # Initialize all weigths by truncated normal initializer, and all biases - # will be initialized by constant zero by default. 
- self._param_initializer = fluid.initializer.TruncatedNormal(scale=config['initializer_range']) - - self._build_model(src_ids, position_ids, sentence_ids, input_mask) - - def _build_model(self, src_ids, position_ids, sentence_ids, input_mask): - # padding id in vocabulary must be set to 0 - emb_out = fluid.layers.embedding(input=src_ids, - size=[self._voc_size, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._word_emb_name, - initializer=self._param_initializer), - is_sparse=False) - position_emb_out = fluid.layers.embedding(input=position_ids, - size=[self._max_position_seq_len, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._pos_emb_name, - initializer=self._param_initializer)) - - sent_emb_out = fluid.layers.embedding(sentence_ids, - size=[self._sent_types, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._sent_emb_name, - initializer=self._param_initializer)) - - emb_out = emb_out + position_emb_out - emb_out = emb_out + sent_emb_out - - emb_out = pre_process_layer(emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder') - - if self._dtype == "float16": - input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype) - - self_attn_mask = fluid.layers.matmul(x=input_mask, y=input_mask, transpose_y=True) - self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) - n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1) - n_head_self_attn_mask.stop_gradient = True - - self._enc_out = encoder(enc_input=emb_out, - attn_bias=n_head_self_attn_mask, - n_layer=self._n_layer, - n_head=self._n_head, - d_key=self._emb_size // self._n_head, - d_value=self._emb_size // self._n_head, - d_model=self._emb_size, - d_inner_hid=self._emb_size * 4, - prepostprocess_dropout=self._prepostprocess_dropout, - attention_dropout=self._attention_dropout, - relu_dropout=0, - hidden_act=self._hidden_act, - preprocess_cmd="", - postprocess_cmd="dan", - param_initializer=self._param_initializer, - name='encoder') - - def get_sequence_output(self): - return self._enc_out - - def get_pooled_output(self): - """Get the first feature of each sequence for classification""" - - next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1]) - next_sent_feat = fluid.layers.fc(input=next_sent_feat, - size=self._emb_size, - act="tanh", - param_attr=fluid.ParamAttr(name="pooled_fc.w_0", - initializer=self._param_initializer), - bias_attr="pooled_fc.b_0") - return next_sent_feat - - def get_pretraining_output(self, mask_label, mask_pos, labels): - """Get the loss & accuracy for pretraining""" - - mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') - - # extract the first token feature in each sentence - next_sent_feat = self.get_pooled_output() - reshaped_emb_out = fluid.layers.reshape(x=self._enc_out, shape=[-1, self._emb_size]) - # extract masked tokens' feature - mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) - - # transform: fc - mask_trans_feat = fluid.layers.fc(input=mask_feat, - size=self._emb_size, - act=self._hidden_act, - param_attr=fluid.ParamAttr(name='mask_lm_trans_fc.w_0', - initializer=self._param_initializer), - bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0')) - # transform: layer norm - mask_trans_feat = pre_process_layer(mask_trans_feat, 'n', name='mask_lm_trans') - - mask_lm_out_bias_attr = fluid.ParamAttr(name="mask_lm_out_fc.b_0", - 
initializer=fluid.initializer.Constant(value=0.0)) - if self._weight_sharing: - fc_out = fluid.layers.matmul(x=mask_trans_feat, - y=fluid.default_main_program().global_block().var(self._word_emb_name), - transpose_y=True) - fc_out += fluid.layers.create_parameter(shape=[self._voc_size], - dtype=self._dtype, - attr=mask_lm_out_bias_attr, - is_bias=True) - - else: - fc_out = fluid.layers.fc(input=mask_trans_feat, - size=self._voc_size, - param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0", - initializer=self._param_initializer), - bias_attr=mask_lm_out_bias_attr) - - mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label) - mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss) - - next_sent_fc_out = fluid.layers.fc(input=next_sent_feat, - size=2, - param_attr=fluid.ParamAttr(name="next_sent_fc.w_0", - initializer=self._param_initializer), - bias_attr="next_sent_fc.b_0") - - next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(logits=next_sent_fc_out, - label=labels, - return_softmax=True) - - next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax, label=labels) - - mean_next_sent_loss = fluid.layers.mean(next_sent_loss) - - loss = mean_next_sent_loss + mean_mask_lm_loss - return next_sent_acc, mean_mask_lm_loss, loss diff --git a/modules/text/language_model/rbt3/model/transformer_encoder.py b/modules/text/language_model/rbt3/model/transformer_encoder.py deleted file mode 100644 index b15d8388..00000000 --- a/modules/text/language_model/rbt3/model/transformer_encoder.py +++ /dev/null @@ -1,295 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Transformer encoder.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from functools import partial - -import paddle.fluid as fluid -import paddle.fluid.layers as layers - - -def multi_head_attention(queries, - keys, - values, - attn_bias, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0., - cache=None, - param_initializer=None, - name='multi_head_att'): - """ - Multi-Head Attention. Note that attn_bias is added to the logit before - computing softmax activiation to mask certain selected positions so that - they will not considered in attention weights. - """ - keys = queries if keys is None else keys - values = keys if values is None else values - - if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): - raise ValueError("Inputs: quries, keys and values should all be 3-D tensors.") - - def __compute_qkv(queries, keys, values, n_head, d_key, d_value): - """ - Add linear projection to queries, keys, and values. 
- """ - q = layers.fc(input=queries, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_query_fc.w_0', initializer=param_initializer), - bias_attr=name + '_query_fc.b_0') - k = layers.fc(input=keys, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_key_fc.w_0', initializer=param_initializer), - bias_attr=name + '_key_fc.b_0') - v = layers.fc(input=values, - size=d_value * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_value_fc.w_0', initializer=param_initializer), - bias_attr=name + '_value_fc.b_0') - return q, k, v - - def __split_heads(x, n_head): - """ - Reshape the last dimension of inpunt tensor x so that it becomes two - dimensions and then transpose. Specifically, input a tensor with shape - [bs, max_sequence_length, n_head * hidden_dim] then output a tensor - with shape [bs, n_head, max_sequence_length, hidden_dim]. - """ - hidden_size = x.shape[-1] - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) - - # permuate the dimensions into: - # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) - - def __combine_heads(x): - """ - Transpose and then reshape the last two dimensions of inpunt tensor x - so that it becomes one dimension, which is reverse to __split_heads. - """ - if len(x.shape) == 3: return x - if len(x.shape) != 4: - raise ValueError("Input(x) should be a 4-D Tensor.") - - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - return layers.reshape(x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True) - - def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): - """ - Scaled Dot-Product Attention - """ - scaled_q = layers.scale(x=q, scale=d_key**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - if attn_bias: - product += attn_bias - weights = layers.softmax(product) - if dropout_rate: - weights = layers.dropout(weights, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.matmul(weights, v) - return out - - q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - - if cache is not None: # use cache and concat time steps - # Since the inplace reshape in __split_heads changes the shape of k and - # v, which is the cache input for next time step, reshape the cache - # input from the previous time step first. - k = cache["k"] = layers.concat([layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1) - v = cache["v"] = layers.concat([layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1) - - q = __split_heads(q, n_head) - k = __split_heads(k, n_head) - v = __split_heads(v, n_head) - - ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate) - - out = __combine_heads(ctx_multiheads) - - # Project back to the model size. 
- proj_out = layers.fc(input=out, - size=d_model, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_output_fc.w_0', initializer=param_initializer), - bias_attr=name + '_output_fc.b_0') - return proj_out - - -def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate, hidden_act, param_initializer=None, name='ffn'): - """ - Position-wise Feed-Forward Networks. - This module consists of two linear transformations with a ReLU activation - in between, which is applied to each position separately and identically. - """ - hidden = layers.fc(input=x, - size=d_inner_hid, - num_flatten_dims=2, - act=hidden_act, - param_attr=fluid.ParamAttr(name=name + '_fc_0.w_0', initializer=param_initializer), - bias_attr=name + '_fc_0.b_0') - if dropout_rate: - hidden = layers.dropout(hidden, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.fc(input=hidden, - size=d_hid, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_fc_1.w_0', initializer=param_initializer), - bias_attr=name + '_fc_1.b_0') - return out - - -def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., name=''): - """ - Add residual connection, layer normalization and droput to the out tensor - optionally according to the value of process_cmd. - This will be used before or after multi-head attention and position-wise - feed-forward networks. - """ - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out_dtype = out.dtype - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float32") - out = layers.layer_norm(out, - begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.ParamAttr(name=name + '_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr(name=name + '_layer_norm_bias', - initializer=fluid.initializer.Constant(0.))) - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float16") - elif cmd == "d": # add dropout - if dropout_rate: - out = layers.dropout(out, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - return out - - -pre_process_layer = partial(pre_post_process_layer, None) -post_process_layer = pre_post_process_layer - - -def encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """The encoder layers that can be stacked to form a deep encoder. - This module consits of a multi-head (self) attention followed by - position-wise feed-forward networks and both the two components companied - with the post_process_layer to add residual connection, layer normalization - and droput. 
- """ - attn_output = multi_head_attention(pre_process_layer(enc_input, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_att'), - None, - None, - attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - param_initializer=param_initializer, - name=name + '_multi_head_att') - attn_output = post_process_layer(enc_input, - attn_output, - postprocess_cmd, - prepostprocess_dropout, - name=name + '_post_att') - ffd_output = positionwise_feed_forward(pre_process_layer(attn_output, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_ffn'), - d_inner_hid, - d_model, - relu_dropout, - hidden_act, - param_initializer=param_initializer, - name=name + '_ffn') - return post_process_layer(attn_output, ffd_output, postprocess_cmd, prepostprocess_dropout, name=name + '_post_ffn') - - -def encoder(enc_input, - attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """ - The encoder is composed of a stack of identical layers returned by calling - encoder_layer. - """ - for i in range(n_layer): - enc_output = encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - param_initializer=param_initializer, - name=name + '_layer_' + str(i)) - enc_input = enc_output - enc_output = pre_process_layer(enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder") - - return enc_output diff --git a/modules/text/language_model/rbt3/module.py b/modules/text/language_model/rbt3/module.py index b35e0cd8..3833c987 100644 --- a/modules/text/language_model/rbt3/module.py +++ b/modules/text/language_model/rbt3/module.py @@ -1,7 +1,6 @@ -# coding:utf-8 -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the "License" +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # @@ -12,62 +11,120 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
+from typing import Dict
 import os
+import math

-from paddlehub import TransformerModule
-from paddlehub.module.module import moduleinfo
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F

-from rbt3.model.bert import BertConfig, BertModel
+from paddlenlp.transformers.roberta.modeling import RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel
+from paddlenlp.transformers.roberta.tokenizer import RobertaTokenizer
+from paddlenlp.metrics import ChunkEvaluator
+from paddlehub.module.module import moduleinfo
+from paddlehub.module.nlp_module import TransformerModule
+from paddlehub.utils.log import logger


 @moduleinfo(
     name="rbt3",
-    version="1.0.0",
+    version="2.0.1",
     summary="rbt3, 3-layer, 768-hidden, 12-heads, 38M parameters ",
     author="ymcui",
     author_email="ymcui@ir.hit.edu.cn",
     type="nlp/semantic_model",
+    meta=TransformerModule,
 )
-class BertWwm(TransformerModule):
-    def _initialize(self):
-        self.MAX_SEQ_LEN = 512
-        self.params_path = os.path.join(self.directory, "assets", "params")
-        self.vocab_path = os.path.join(self.directory, "assets", "vocab.txt")
+class Roberta(nn.Layer):
+    """
+    RoBERTa model
+    """

-        bert_config_path = os.path.join(self.directory, "assets", "bert_config_rbt3.json")
-        self.bert_config = BertConfig(bert_config_path)
+    def __init__(
+            self,
+            task: str = None,
+            load_checkpoint: str = None,
+            label_map: Dict = None,
+            num_classes: int = 2,
+            **kwargs,
+    ):
+        super(Roberta, self).__init__()
+        if label_map:
+            self.label_map = label_map
+            self.num_classes = len(label_map)
+        else:
+            self.num_classes = num_classes

-    def net(self, input_ids, position_ids, segment_ids, input_mask):
-        """
-        create neural network.
+        if task == 'sequence_classification':
+            task = 'seq-cls'
+            logger.warning(
+                "current task name 'sequence_classification' was renamed to 'seq-cls', "
+                "'sequence_classification' has been deprecated and will be removed in the future.",
+            )
+        if task == 'seq-cls':
+            self.model = RobertaForSequenceClassification.from_pretrained(
+                pretrained_model_name_or_path='rbt3',
+                num_classes=self.num_classes,
+                **kwargs
+            )
+            self.criterion = paddle.nn.loss.CrossEntropyLoss()
+            self.metric = paddle.metric.Accuracy()
+        elif task == 'token-cls':
+            self.model = RobertaForTokenClassification.from_pretrained(
+                pretrained_model_name_or_path='rbt3',
+                num_classes=self.num_classes,
+                **kwargs
+            )
+            self.criterion = paddle.nn.loss.CrossEntropyLoss()
+            self.metric = ChunkEvaluator(
+                label_list=[self.label_map[i] for i in sorted(self.label_map.keys())]
+            )
+        elif task is None:
+            self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='rbt3', **kwargs)
+        else:
+            raise RuntimeError("Unknown task {}, task should be one in {}".format(
+                task, self._tasks_supported))

-        Args:
-            input_ids (tensor): the word ids.
-            position_ids (tensor): the position ids.
-            segment_ids (tensor): the segment ids.
-            input_mask (tensor): the padding mask.
+        self.task = task

-        Returns:
-            pooled_output (tensor): sentence-level output for classification task.
-            sequence_output (tensor): token-level output for sequence task.
-        """
-        bert = BertModel(src_ids=input_ids,
-                         position_ids=position_ids,
-                         sentence_ids=segment_ids,
-                         input_mask=input_mask,
-                         config=self.bert_config,
-                         use_fp16=False)
-        pooled_output = bert.get_pooled_output()
-        sequence_output = bert.get_sequence_output()
-        return pooled_output, sequence_output
+        if load_checkpoint is not None and os.path.isfile(load_checkpoint):
+            state_dict = paddle.load(load_checkpoint)
+            self.set_state_dict(state_dict)
+            logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))

+    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None):
+        result = self.model(input_ids, token_type_ids, position_ids, attention_mask)
+        if self.task == 'seq-cls':
+            logits = result
+            probs = F.softmax(logits, axis=1)
+            if labels is not None:
+                loss = self.criterion(logits, labels)
+                correct = self.metric.compute(probs, labels)
+                acc = self.metric.update(correct)
+                return probs, loss, {'acc': acc}
+            return probs
+        elif self.task == 'token-cls':
+            logits = result
+            token_level_probs = F.softmax(logits, axis=-1)
+            preds = token_level_probs.argmax(axis=-1)
+            if labels is not None:
+                loss = self.criterion(logits, labels.unsqueeze(-1))
+                num_infer_chunks, num_label_chunks, num_correct_chunks = \
+                    self.metric.compute(None, seq_lengths, preds, labels)
+                self.metric.update(
+                    num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy())
+                _, _, f1_score = map(float, self.metric.accumulate())
+                return token_level_probs, loss, {'f1_score': f1_score}
+            return token_level_probs
+        else:
+            sequence_output, pooled_output = result
+            return sequence_output, pooled_output

-if __name__ == '__main__':
-    test_module = BertWwm()
+    @staticmethod
+    def get_tokenizer(*args, **kwargs):
+        """
+        Gets the tokenizer that is customized for this module.
+        """
+        return RobertaTokenizer.from_pretrained(
+            pretrained_model_name_or_path='rbt3', *args, **kwargs)
diff --git a/modules/text/language_model/rbtl3/README.md b/modules/text/language_model/rbtl3/README.md
index 53107271..f1dd9c43 100644
--- a/modules/text/language_model/rbtl3/README.md
+++ b/modules/text/language_model/rbtl3/README.md
@@ -1,5 +1,5 @@
 ```shell
-$ hub install rbtl3==1.0.0
+$ hub install rbtl3==2.0.1
 ```


@@ -9,94 +9,130 @@ $ hub install rbtl3==1.0.0

 ## API
 ```python
-def context(
-    trainable=True,
-    max_seq_len=128
+def __init__(
+    task=None,
+    load_checkpoint=None,
+    label_map=None,
+    num_classes=2,
+    **kwargs,
 )
 ```
-Gets the Module's context: its inputs, its outputs, and a copy of the pretrained Paddle Program.
-
-**Parameters**
-
-> trainable: if True, the Module's parameters are updated during fine-tuning; otherwise they stay fixed.
-> max_seq_len: maximum sequence length of the BERT model. Shorter sequences are padded to **max_seq_len**, and longer ones are truncated to **max_seq_len**; valid values range from 0 to 512.
-
-**Returns**
-> inputs: dict with the following fields:
-> >**input_ids** holds each token's word id in the BERT vocabulary after tokenization, shape \[batch_size, max_seq_len\], dtype int64;
-> >**position_ids** holds each token's position in the input text, shape \[batch_size, max_seq_len\], dtype int64;
-> >**segment_ids** marks which text a token belongs to (text 1 or text 2), shape \[batch_size, max_seq_len\], dtype int64;
-> >**input_mask** marks whether a token is padding, shape \[batch_size, max_seq_len\], dtype int64;
->
-> outputs: dict of the Module's output features:
-> >**pooled_output** holds sentence-level features for tasks such as text classification, shape \[batch_size, 768\];
-> >**sequence_output** holds token-level features for tasks such as sequence labeling, shape \[batch_size, seq_len, 768\];
->
-> program: the Program containing the Module's computation graph.
+Creates a Module object (dygraph version).

+**Parameters**

+* `task`: task name; either `seq-cls` (text classification; the former name `sequence_classification` is deprecated and will be removed) or `token-cls` (sequence labeling).
+* `load_checkpoint`: path to model parameters saved with the PaddleHub fine-tune API.
+* `label_map`: label map used at prediction time.
+* `num_classes`: number of classes for classification; can be omitted when `label_map` is given. Defaults to 2.
+* `**kwargs`: extra keyword arguments supplied by the user.

 ```python
-def get_embedding(
-    texts,
-    use_gpu=False,
-    batch_size=1
+def predict(
+    data,
+    max_seq_len=128,
+    batch_size=1,
+    use_gpu=False
 )
 ```
-Gets sentence-level and token-level features for the input texts.
-
 **Parameters**
-> texts: list of input texts in the format \[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\]; each element is one sample, and a sample may contain text\_a and text\_b.
-> use_gpu: whether to use the GPU; defaults to False. GPU users are advised to enable use_gpu.
+* `data`: data to predict, in the format \[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\]; each element is one sample,
+  and a sample may contain text\_a and text\_b. The number of texts per sample (1 or 2) must match the one used during training.
+* `max_seq_len`: maximum text length the model processes.
+* `batch_size`: batch size.
+* `use_gpu`: whether to use the GPU; defaults to False. GPU users are advised to enable use_gpu.

 **Returns**
-> results: list in the format \[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\]; each element is the feature output of the corresponding sample, with a sentence-level pooled\_feature and a token-level seq\_feature.
->
-
 ```python
-def get_params_layer()
+def get_embedding(
+    data,
+    max_seq_len=128,
+    batch_size=1,
+    use_gpu=False
+)
 ```
-Gets per-parameter layer information; used together with ULMFiTStrategy to set layer-wise learning rates and gradual unfreezing strictly by layer.
+Gets sentence-level and token-level features for the input texts.

 **Parameters**
-> none
+* `data`: list of input texts in the format \[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\]; each element is one sample, and a sample may contain text\_a and text\_b.
+* `max_seq_len`: maximum text length the model processes.
+* `batch_size`: batch size.
+* `use_gpu`: whether to use the GPU; defaults to False. GPU users are advised to enable use_gpu.

 **Returns**
-> params_layer: dict mapping each parameter name to the layer the parameter belongs to
+* `results`: list in the format \[\[sample\_a\_pooled\_feature, sample\_a\_seq\_feature\], \[sample\_b\_pooled\_feature, sample\_b\_seq\_feature\],…,\]; each element is the feature output of the corresponding sample, with a sentence-level pooled\_feature and a token-level seq\_feature.


 **Code example**

 ```python
 import paddlehub as hub

-# Load $ hub install rbtl3 pretrained model
-module = hub.Module(name="rbtl3")
-inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
+data = [
+    ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'],
+    ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'],
+    ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'],
+]
+label_map = {0: 'negative', 1: 'positive'}
+
+model = hub.Module(
+    name='rbtl3',
+    version='2.0.1',
+    task='seq-cls',
+    load_checkpoint='/path/to/parameters',
+    label_map=label_map)
+results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
+for idx, text in enumerate(data):
+    print('Data: {} \t Label: {}'.format(text, results[idx]))
+```

-# Must feed all the tensor of rbtl3's module need
-input_ids = inputs["input_ids"]
-position_ids = inputs["position_ids"]
-segment_ids = inputs["segment_ids"]
-input_mask = inputs["input_mask"]
+For details, see the PaddleHub demos:
+- [Text classification](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification)
+- [Sequence labeling](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling)

-# Use "pooled_output" for sentence-level output.
-pooled_output = outputs["pooled_output"]
+## Serving

-# Use "sequence_output" for token-level output.
-sequence_output = outputs["sequence_output"]
+PaddleHub Serving can deploy an online service for extracting pretrained embeddings.

-# Use "get_embedding" to get embedding result.
-embedding_result = module.get_embedding(texts=[["Sample1_text_a"],["Sample2_text_a","Sample2_text_b"]], use_gpu=True)
+### Step1: start PaddleHub Serving

-# Use "get_params_layer" to get params layer and used to ULMFiTStrategy.
-params_layer = module.get_params_layer()
-strategy = hub.finetune.strategy.ULMFiTStrategy(frz_params_layer=params_layer, dis_params_layer=params_layer)
+Run the start command:
+
+```shell
+$ hub serving start -m rbtl3
+```
+
+This deploys an online API for extracting pretrained embeddings; the default port is 8866.
+
+**NOTE:** to predict on GPU, set the CUDA_VISIBLE_DEVICES environment variable before starting the service; otherwise no setting is needed.
+
+### Step2: send prediction requests
+
+With the server configured, the following few lines of code send a prediction request and fetch the result:
+
+```python
+import requests
+import json
+
+# Texts for embedding extraction, formatted as [[text_1], [text_2], ...]
+text = [["今天是个好日子"], ["天气预报说今天要下雨"]]
+# Pass the texts to the prediction method via a key, "data" in this example;
+# the local equivalent is module.get_embedding(data=text)
+data = {"data": text}
+# Send a POST request; the content type must be JSON, and the IP in the URL
+# must be changed to that of the serving machine
+url = "http://10.12.121.132:8866/predict/rbtl3"
+# Set the request headers to application/json
+headers = {"Content-Type": "application/json"}
+
+r = requests.post(url=url, headers=headers, data=json.dumps(data))
+print(r.json())
 ```

 ## Code
@@ -109,12 +145,16 @@ https://github.com/ymcui/Chinese-BERT-wwm

 ## Dependencies

-paddlepaddle >= 1.6.2
+paddlepaddle >= 2.0.0

-paddlehub >= 1.6.0
+paddlehub >= 2.0.0

 ## Release history

 * 1.0.0

   Initial release
+
+* 2.0.1
+
+  Fully upgraded to dygraph, with API changes; task names adjusted and the sequence labeling task `token-cls` added
diff --git a/modules/text/language_model/rbtl3/model/__init__.py b/modules/text/language_model/rbtl3/model/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/modules/text/language_model/rbtl3/model/bert.py b/modules/text/language_model/rbtl3/model/bert.py
deleted file mode 100644
index 8c27ad34..00000000
--- a/modules/text/language_model/rbtl3/model/bert.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT model.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import six -import json - -import paddle.fluid as fluid - -from rbtl3.model.transformer_encoder import encoder, pre_process_layer - - -class BertConfig(object): - def __init__(self, config_path): - self._config_dict = self._parse(config_path) - - def _parse(self, config_path): - try: - with open(config_path) as json_file: - config_dict = json.load(json_file) - except Exception: - raise IOError("Error in parsing bert model config file '%s'" % config_path) - else: - return config_dict - - def __getitem__(self, key): - return self._config_dict[key] - - def print_config(self): - for arg, value in sorted(six.iteritems(self._config_dict)): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -class BertModel(object): - def __init__(self, src_ids, position_ids, sentence_ids, input_mask, config, weight_sharing=True, use_fp16=False): - - self._emb_size = config['hidden_size'] - self._n_layer = config['num_hidden_layers'] - self._n_head = config['num_attention_heads'] - self._voc_size = config['vocab_size'] - self._max_position_seq_len = config['max_position_embeddings'] - self._sent_types = config['type_vocab_size'] - self._hidden_act = config['hidden_act'] - self._prepostprocess_dropout = config['hidden_dropout_prob'] - self._attention_dropout = config['attention_probs_dropout_prob'] - self._weight_sharing = weight_sharing - - self._word_emb_name = "word_embedding" - self._pos_emb_name = "pos_embedding" - self._sent_emb_name = "sent_embedding" - self._dtype = "float16" if use_fp16 else "float32" - - # Initialize all weigths by truncated normal initializer, and all biases - # will be initialized by constant zero by default. 
- self._param_initializer = fluid.initializer.TruncatedNormal(scale=config['initializer_range']) - - self._build_model(src_ids, position_ids, sentence_ids, input_mask) - - def _build_model(self, src_ids, position_ids, sentence_ids, input_mask): - # padding id in vocabulary must be set to 0 - emb_out = fluid.layers.embedding(input=src_ids, - size=[self._voc_size, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._word_emb_name, - initializer=self._param_initializer), - is_sparse=False) - position_emb_out = fluid.layers.embedding(input=position_ids, - size=[self._max_position_seq_len, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._pos_emb_name, - initializer=self._param_initializer)) - - sent_emb_out = fluid.layers.embedding(sentence_ids, - size=[self._sent_types, self._emb_size], - dtype=self._dtype, - param_attr=fluid.ParamAttr(name=self._sent_emb_name, - initializer=self._param_initializer)) - - emb_out = emb_out + position_emb_out - emb_out = emb_out + sent_emb_out - - emb_out = pre_process_layer(emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder') - - if self._dtype == "float16": - input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype) - - self_attn_mask = fluid.layers.matmul(x=input_mask, y=input_mask, transpose_y=True) - self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) - n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1) - n_head_self_attn_mask.stop_gradient = True - - self._enc_out = encoder(enc_input=emb_out, - attn_bias=n_head_self_attn_mask, - n_layer=self._n_layer, - n_head=self._n_head, - d_key=self._emb_size // self._n_head, - d_value=self._emb_size // self._n_head, - d_model=self._emb_size, - d_inner_hid=self._emb_size * 4, - prepostprocess_dropout=self._prepostprocess_dropout, - attention_dropout=self._attention_dropout, - relu_dropout=0, - hidden_act=self._hidden_act, - preprocess_cmd="", - postprocess_cmd="dan", - param_initializer=self._param_initializer, - name='encoder') - - def get_sequence_output(self): - return self._enc_out - - def get_pooled_output(self): - """Get the first feature of each sequence for classification""" - - next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1]) - next_sent_feat = fluid.layers.fc(input=next_sent_feat, - size=self._emb_size, - act="tanh", - param_attr=fluid.ParamAttr(name="pooled_fc.w_0", - initializer=self._param_initializer), - bias_attr="pooled_fc.b_0") - return next_sent_feat - - def get_pretraining_output(self, mask_label, mask_pos, labels): - """Get the loss & accuracy for pretraining""" - - mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') - - # extract the first token feature in each sentence - next_sent_feat = self.get_pooled_output() - reshaped_emb_out = fluid.layers.reshape(x=self._enc_out, shape=[-1, self._emb_size]) - # extract masked tokens' feature - mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) - - # transform: fc - mask_trans_feat = fluid.layers.fc(input=mask_feat, - size=self._emb_size, - act=self._hidden_act, - param_attr=fluid.ParamAttr(name='mask_lm_trans_fc.w_0', - initializer=self._param_initializer), - bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0')) - # transform: layer norm - mask_trans_feat = pre_process_layer(mask_trans_feat, 'n', name='mask_lm_trans') - - mask_lm_out_bias_attr = fluid.ParamAttr(name="mask_lm_out_fc.b_0", - 
initializer=fluid.initializer.Constant(value=0.0)) - if self._weight_sharing: - fc_out = fluid.layers.matmul(x=mask_trans_feat, - y=fluid.default_main_program().global_block().var(self._word_emb_name), - transpose_y=True) - fc_out += fluid.layers.create_parameter(shape=[self._voc_size], - dtype=self._dtype, - attr=mask_lm_out_bias_attr, - is_bias=True) - - else: - fc_out = fluid.layers.fc(input=mask_trans_feat, - size=self._voc_size, - param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0", - initializer=self._param_initializer), - bias_attr=mask_lm_out_bias_attr) - - mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label) - mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss) - - next_sent_fc_out = fluid.layers.fc(input=next_sent_feat, - size=2, - param_attr=fluid.ParamAttr(name="next_sent_fc.w_0", - initializer=self._param_initializer), - bias_attr="next_sent_fc.b_0") - - next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(logits=next_sent_fc_out, - label=labels, - return_softmax=True) - - next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax, label=labels) - - mean_next_sent_loss = fluid.layers.mean(next_sent_loss) - - loss = mean_next_sent_loss + mean_mask_lm_loss - return next_sent_acc, mean_mask_lm_loss, loss diff --git a/modules/text/language_model/rbtl3/model/transformer_encoder.py b/modules/text/language_model/rbtl3/model/transformer_encoder.py deleted file mode 100644 index b15d8388..00000000 --- a/modules/text/language_model/rbtl3/model/transformer_encoder.py +++ /dev/null @@ -1,295 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Transformer encoder.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from functools import partial - -import paddle.fluid as fluid -import paddle.fluid.layers as layers - - -def multi_head_attention(queries, - keys, - values, - attn_bias, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0., - cache=None, - param_initializer=None, - name='multi_head_att'): - """ - Multi-Head Attention. Note that attn_bias is added to the logit before - computing softmax activiation to mask certain selected positions so that - they will not considered in attention weights. - """ - keys = queries if keys is None else keys - values = keys if values is None else values - - if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): - raise ValueError("Inputs: quries, keys and values should all be 3-D tensors.") - - def __compute_qkv(queries, keys, values, n_head, d_key, d_value): - """ - Add linear projection to queries, keys, and values. 
- """ - q = layers.fc(input=queries, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_query_fc.w_0', initializer=param_initializer), - bias_attr=name + '_query_fc.b_0') - k = layers.fc(input=keys, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_key_fc.w_0', initializer=param_initializer), - bias_attr=name + '_key_fc.b_0') - v = layers.fc(input=values, - size=d_value * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_value_fc.w_0', initializer=param_initializer), - bias_attr=name + '_value_fc.b_0') - return q, k, v - - def __split_heads(x, n_head): - """ - Reshape the last dimension of inpunt tensor x so that it becomes two - dimensions and then transpose. Specifically, input a tensor with shape - [bs, max_sequence_length, n_head * hidden_dim] then output a tensor - with shape [bs, n_head, max_sequence_length, hidden_dim]. - """ - hidden_size = x.shape[-1] - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) - - # permuate the dimensions into: - # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) - - def __combine_heads(x): - """ - Transpose and then reshape the last two dimensions of inpunt tensor x - so that it becomes one dimension, which is reverse to __split_heads. - """ - if len(x.shape) == 3: return x - if len(x.shape) != 4: - raise ValueError("Input(x) should be a 4-D Tensor.") - - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - return layers.reshape(x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True) - - def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): - """ - Scaled Dot-Product Attention - """ - scaled_q = layers.scale(x=q, scale=d_key**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - if attn_bias: - product += attn_bias - weights = layers.softmax(product) - if dropout_rate: - weights = layers.dropout(weights, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.matmul(weights, v) - return out - - q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - - if cache is not None: # use cache and concat time steps - # Since the inplace reshape in __split_heads changes the shape of k and - # v, which is the cache input for next time step, reshape the cache - # input from the previous time step first. - k = cache["k"] = layers.concat([layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1) - v = cache["v"] = layers.concat([layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1) - - q = __split_heads(q, n_head) - k = __split_heads(k, n_head) - v = __split_heads(v, n_head) - - ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate) - - out = __combine_heads(ctx_multiheads) - - # Project back to the model size. 
- proj_out = layers.fc(input=out, - size=d_model, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_output_fc.w_0', initializer=param_initializer), - bias_attr=name + '_output_fc.b_0') - return proj_out - - -def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate, hidden_act, param_initializer=None, name='ffn'): - """ - Position-wise Feed-Forward Networks. - This module consists of two linear transformations with a ReLU activation - in between, which is applied to each position separately and identically. - """ - hidden = layers.fc(input=x, - size=d_inner_hid, - num_flatten_dims=2, - act=hidden_act, - param_attr=fluid.ParamAttr(name=name + '_fc_0.w_0', initializer=param_initializer), - bias_attr=name + '_fc_0.b_0') - if dropout_rate: - hidden = layers.dropout(hidden, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.fc(input=hidden, - size=d_hid, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + '_fc_1.w_0', initializer=param_initializer), - bias_attr=name + '_fc_1.b_0') - return out - - -def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., name=''): - """ - Add residual connection, layer normalization and droput to the out tensor - optionally according to the value of process_cmd. - This will be used before or after multi-head attention and position-wise - feed-forward networks. - """ - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out_dtype = out.dtype - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float32") - out = layers.layer_norm(out, - begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.ParamAttr(name=name + '_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr(name=name + '_layer_norm_bias', - initializer=fluid.initializer.Constant(0.))) - if out_dtype == fluid.core.VarDesc.VarType.FP16: - out = layers.cast(x=out, dtype="float16") - elif cmd == "d": # add dropout - if dropout_rate: - out = layers.dropout(out, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - return out - - -pre_process_layer = partial(pre_post_process_layer, None) -post_process_layer = pre_post_process_layer - - -def encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """The encoder layers that can be stacked to form a deep encoder. - This module consits of a multi-head (self) attention followed by - position-wise feed-forward networks and both the two components companied - with the post_process_layer to add residual connection, layer normalization - and droput. 
- """ - attn_output = multi_head_attention(pre_process_layer(enc_input, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_att'), - None, - None, - attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - param_initializer=param_initializer, - name=name + '_multi_head_att') - attn_output = post_process_layer(enc_input, - attn_output, - postprocess_cmd, - prepostprocess_dropout, - name=name + '_post_att') - ffd_output = positionwise_feed_forward(pre_process_layer(attn_output, - preprocess_cmd, - prepostprocess_dropout, - name=name + '_pre_ffn'), - d_inner_hid, - d_model, - relu_dropout, - hidden_act, - param_initializer=param_initializer, - name=name + '_ffn') - return post_process_layer(attn_output, ffd_output, postprocess_cmd, prepostprocess_dropout, name=name + '_post_ffn') - - -def encoder(enc_input, - attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name=''): - """ - The encoder is composed of a stack of identical layers returned by calling - encoder_layer. - """ - for i in range(n_layer): - enc_output = encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - param_initializer=param_initializer, - name=name + '_layer_' + str(i)) - enc_input = enc_output - enc_output = pre_process_layer(enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder") - - return enc_output diff --git a/modules/text/language_model/rbtl3/module.py b/modules/text/language_model/rbtl3/module.py index a60c30a4..500fc42c 100644 --- a/modules/text/language_model/rbtl3/module.py +++ b/modules/text/language_model/rbtl3/module.py @@ -1,7 +1,6 @@ -# coding:utf-8 -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the "License" +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # @@ -12,62 +11,120 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
+from typing import Dict
 import os
+import math

-from paddlehub import TransformerModule
-from paddlehub.module.module import moduleinfo
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F

-from rbtl3.model.bert import BertConfig, BertModel
+from paddlenlp.transformers.roberta.modeling import RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel
+from paddlenlp.transformers.roberta.tokenizer import RobertaTokenizer
+from paddlenlp.metrics import ChunkEvaluator
+from paddlehub.module.module import moduleinfo
+from paddlehub.module.nlp_module import TransformerModule
+from paddlehub.utils.log import logger


 @moduleinfo(
     name="rbtl3",
-    version="1.0.0",
+    version="2.0.1",
     summary="rbtl3, 3-layer, 1024-hidden, 16-heads, 61M parameters ",
     author="ymcui",
     author_email="ymcui@ir.hit.edu.cn",
     type="nlp/semantic_model",
+    meta=TransformerModule,
 )
-class BertWwm(TransformerModule):
-    def _initialize(self):
-        self.MAX_SEQ_LEN = 512
-        self.params_path = os.path.join(self.directory, "assets", "params")
-        self.vocab_path = os.path.join(self.directory, "assets", "vocab.txt")
+class Roberta(nn.Layer):
+    """
+    RoBERTa model
+    """

-        bert_config_path = os.path.join(self.directory, "assets", "bert_config_rbtl3.json")
-        self.bert_config = BertConfig(bert_config_path)
+    def __init__(
+            self,
+            task: str = None,
+            load_checkpoint: str = None,
+            label_map: Dict = None,
+            num_classes: int = 2,
+            **kwargs,
+    ):
+        super(Roberta, self).__init__()
+        if label_map:
+            self.label_map = label_map
+            self.num_classes = len(label_map)
+        else:
+            self.num_classes = num_classes

-    def net(self, input_ids, position_ids, segment_ids, input_mask):
-        """
-        create neural network.
+        if task == 'sequence_classification':
+            task = 'seq-cls'
+            logger.warning(
+                "current task name 'sequence_classification' was renamed to 'seq-cls', "
+                "'sequence_classification' has been deprecated and will be removed in the future.",
+            )
+        if task == 'seq-cls':
+            self.model = RobertaForSequenceClassification.from_pretrained(
+                pretrained_model_name_or_path='rbtl3',
+                num_classes=self.num_classes,
+                **kwargs
+            )
+            self.criterion = paddle.nn.loss.CrossEntropyLoss()
+            self.metric = paddle.metric.Accuracy()
+        elif task == 'token-cls':
+            self.model = RobertaForTokenClassification.from_pretrained(
+                pretrained_model_name_or_path='rbtl3',
+                num_classes=self.num_classes,
+                **kwargs
+            )
+            self.criterion = paddle.nn.loss.CrossEntropyLoss()
+            self.metric = ChunkEvaluator(
+                label_list=[self.label_map[i] for i in sorted(self.label_map.keys())]
+            )
+        elif task is None:
+            self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='rbtl3', **kwargs)
+        else:
+            raise RuntimeError("Unknown task {}, task should be one in {}".format(
+                task, self._tasks_supported))

-        Args:
-            input_ids (tensor): the word ids.
-            position_ids (tensor): the position ids.
-            segment_ids (tensor): the segment ids.
-            input_mask (tensor): the padding mask.
+        self.task = task

-        Returns:
-            pooled_output (tensor): sentence-level output for classification task.
-            sequence_output (tensor): token-level output for sequence task.
-        """
-        bert = BertModel(src_ids=input_ids,
-                         position_ids=position_ids,
-                         sentence_ids=segment_ids,
-                         input_mask=input_mask,
-                         config=self.bert_config,
-                         use_fp16=False)
-        pooled_output = bert.get_pooled_output()
-        sequence_output = bert.get_sequence_output()
-        return pooled_output, sequence_output
+        if load_checkpoint is not None and os.path.isfile(load_checkpoint):
+            state_dict = paddle.load(load_checkpoint)
+            self.set_state_dict(state_dict)
+            logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))

+    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None):
+        result = self.model(input_ids, token_type_ids, position_ids, attention_mask)
+        if self.task == 'seq-cls':
+            logits = result
+            probs = F.softmax(logits, axis=1)
+            if labels is not None:
+                loss = self.criterion(logits, labels)
+                correct = self.metric.compute(probs, labels)
+                acc = self.metric.update(correct)
+                return probs, loss, {'acc': acc}
+            return probs
+        elif self.task == 'token-cls':
+            logits = result
+            token_level_probs = F.softmax(logits, axis=-1)
+            preds = token_level_probs.argmax(axis=-1)
+            if labels is not None:
+                loss = self.criterion(logits, labels.unsqueeze(-1))
+                num_infer_chunks, num_label_chunks, num_correct_chunks = \
+                    self.metric.compute(None, seq_lengths, preds, labels)
+                self.metric.update(
+                    num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy())
+                _, _, f1_score = map(float, self.metric.accumulate())
+                return token_level_probs, loss, {'f1_score': f1_score}
+            return token_level_probs
+        else:
+            sequence_output, pooled_output = result
+            return sequence_output, pooled_output

-if __name__ == '__main__':
-    test_module = BertWwm()
+    @staticmethod
+    def get_tokenizer(*args, **kwargs):
+        """
+        Gets the tokenizer that is customized for this module.
+        """
+        return RobertaTokenizer.from_pretrained(
+            pretrained_model_name_or_path='rbtl3', *args, **kwargs)
diff --git a/modules/text/language_model/roberta-wwm-ext-large/README.md b/modules/text/language_model/roberta-wwm-ext-large/README.md
index 77d1b02c..4d19bf2b 100644
--- a/modules/text/language_model/roberta-wwm-ext-large/README.md
+++ b/modules/text/language_model/roberta-wwm-ext-large/README.md
@@ -1,5 +1,5 @@
 ```shell
-$ hub install roberta-wwm-ext-large==2.0.0
+$ hub install roberta-wwm-ext-large==2.0.1
 ```


@@ -13,29 +13,35 @@ def __init__(
     task=None,
     load_checkpoint=None,
-    label_map=None)
+    label_map=None,
+    num_classes=2,
+    **kwargs,
+)
 ```

 Creates a Module object (dygraph version).

 **Parameters**

-* `task`: task name; may be `sequence_classification`.
+* `task`: task name; either `seq-cls` (text classification; the former name `sequence_classification` is deprecated and will be removed) or `token-cls` (sequence labeling).
 * `load_checkpoint`: path to model parameters saved with the PaddleHub fine-tune API.
 * `label_map`: label map used at prediction time.
+* `num_classes`: number of classes for classification; can be omitted when `label_map` is given. Defaults to 2.
+* `**kwargs`: extra keyword arguments supplied by the user.

 ```python
 def predict(
     data,
     max_seq_len=128,
     batch_size=1,
-    use_gpu=False)
+    use_gpu=False
+)
 ```

 **Parameters**

 * `data`: data to predict, in the format \[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\]; each element is one sample,
-    and a sample may contain text\_a and text\_b. The number of texts per sample (1 or 2) must match the one used during training.
+  and a sample may contain text\_a and text\_b. The number of texts per sample (1 or 2) must match the one used during training.
 * `max_seq_len`: maximum text length the model processes
 * `batch_size`: batch size
 * `use_gpu`: whether to use the GPU; defaults to False. GPU users are advised to enable use_gpu.
@@ -44,7 +50,9 @@ def predict(
 ```python
 def get_embedding(
-    texts,
+    data,
+    max_seq_len=128,
+    batch_size=1,
     use_gpu=False
 )
 ```
@@ -53,7 +61,9 @@ def get_embedding(
 **Parameters**

-* `texts`: list of input texts in the format \[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\]; each element is one sample, and a sample may contain text\_a and text\_b.
+* `data`: list of input texts in the format \[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\]; each element is one sample, and a sample may contain text\_a and text\_b.
+* `max_seq_len`: maximum text length the model processes.
+* `batch_size`: batch size.
 * `use_gpu`: whether to use the GPU; defaults to False. GPU users are advised to enable use_gpu.

 **Returns**
@@ -67,16 +77,16 @@ def get_embedding(
 import paddlehub as hub

 data = [
-    '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般',
-    '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片',
-    '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。',
+    ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'],
+    ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'],
+    ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'],
 ]
 label_map = {0: 'negative', 1: 'positive'}

 model = hub.Module(
     name='roberta-wwm-ext-large',
-    version='2.0.0',
-    task='sequence_classification',
+    version='2.0.1',
+    task='seq-cls',
     load_checkpoint='/path/to/parameters',
     label_map=label_map)
 results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False)
@@ -84,7 +94,9 @@ for idx, text in enumerate(data):
     print('Data: {} \t Lable: {}'.format(text, results[idx]))
 ```

-See the PaddleHub text classification demo: https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation
+For details, see the PaddleHub demos:
+- [Text classification](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification)
+- [Sequence labeling](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling)

 ## Serving

@@ -110,12 +122,12 @@ $ hub serving start -m roberta-wwm-ext-large
 import requests
 import json

-# Texts for prediction, wrapped in a dict {"text": [text_1, text_2, ... ]}
-text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]]
-# Pass the texts to the prediction method via a key, "texts" in this example
-# the local equivalent is module.get_embedding(texts=text)
-data = {"texts": text}
-# Send a POST request; the content type must be JSON
+# Texts for embedding extraction, formatted as [[text_1], [text_2], ...]
+text = [["今天是个好日子"], ["天气预报说今天要下雨"]]
+# Pass the texts to the prediction method via a key, "data" in this example;
+# the local equivalent is module.get_embedding(data=text)
+data = {"data": text}
+# Send a POST request; the content type must be JSON, and the IP in the URL
+# must be changed to that of the serving machine
 url = "http://10.12.121.132:8866/predict/roberta-wwm-ext-large"
 # Set the request headers to application/json
 headers = {"Content-Type": "application/json"}
@@ -126,7 +138,7 @@ print(r.json())

 ## Code

-https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/pretrain_langauge_models/BERT
+https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/paddlenlp/transformers/roberta

 ## Dependencies

@@ -144,3 +156,7 @@ paddlehub >= 2.0.0

 * 2.0.0

   Fully upgraded to dygraph, with API changes.
+
+* 2.0.1
+
+  Task names adjusted; sequence labeling task `token-cls` added
diff --git a/modules/text/language_model/roberta-wwm-ext-large/module.py b/modules/text/language_model/roberta-wwm-ext-large/module.py
index 7785bf1e..aa45811d 100644
--- a/modules/text/language_model/roberta-wwm-ext-large/module.py
+++ b/modules/text/language_model/roberta-wwm-ext-large/module.py
@@ -11,29 +11,31 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Dict, List, Optional, Union, Tuple
+from typing import Dict
 import os
+import math

-from paddle.dataset.common import DATA_HOME
 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlehub import BertTokenizer
-from paddlehub.module.modeling_roberta import RobertaModel, RobertaForSequenceClassification
-from paddlehub.module.module import moduleinfo, serving
+from paddlenlp.transformers.roberta.modeling import RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel
+from paddlenlp.transformers.roberta.tokenizer import RobertaTokenizer
+from paddlenlp.metrics import ChunkEvaluator
+from paddlehub.module.module import moduleinfo
+from paddlehub.module.nlp_module import TransformerModule
 from paddlehub.utils.log import logger
-from paddlehub.utils.utils import download


 @moduleinfo(
     name="roberta-wwm-ext-large",
-    version="2.0.0",
+    version="2.0.1",
     summary=
     "chinese-roberta-wwm-ext-large, 24-layer, 1024-hidden, 16-heads, 340M parameters. The module is executed as paddle.dygraph.",
     author="ymcui",
     author_email="ymcui@ir.hit.edu.cn",
     type="nlp/semantic_model",
+    meta=TransformerModule,
 )
 class Roberta(nn.Layer):
     """
@@ -42,181 +44,88 @@ class Roberta(nn.Layer):

     def __init__(
         self,
-        task=None,
-        load_checkpoint=None,
-        label_map=None,
+        task: str = None,
+        load_checkpoint: str = None,
+        label_map: Dict = None,
+        num_classes: int = 2,
+        **kwargs,
     ):
         super(Roberta, self).__init__()
-        # TODO(zhangxuefei): add token_classification task
+        if label_map:
+            self.label_map = label_map
+            self.num_classes = len(label_map)
+        else:
+            self.num_classes = num_classes
+
         if task == 'sequence_classification':
+            task = 'seq-cls'
+            logger.warning(
+                "current task name 'sequence_classification' was renamed to 'seq-cls', "
+                "'sequence_classification' has been deprecated and will be removed in the future.",
+            )
+        if task == 'seq-cls':
             self.model = RobertaForSequenceClassification.from_pretrained(
-                pretrained_model_name_or_path='roberta-wwm-ext-large')
+                pretrained_model_name_or_path='roberta-wwm-ext-large',
+                num_classes=self.num_classes,
+                **kwargs
+            )
             self.criterion = paddle.nn.loss.CrossEntropyLoss()
-            self.metric = paddle.metric.Accuracy(name='acc_accumulation')
+            self.metric = paddle.metric.Accuracy()
+        elif task == 'token-cls':
+            self.model = RobertaForTokenClassification.from_pretrained(
+                pretrained_model_name_or_path='roberta-wwm-ext-large',
+                num_classes=self.num_classes,
+                **kwargs
+            )
+            self.criterion = paddle.nn.loss.CrossEntropyLoss()
+            self.metric = ChunkEvaluator(
+                label_list=[self.label_map[i] for i in sorted(self.label_map.keys())]
+            )
         elif task is None:
-            self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-wwm-ext-large')
+            self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-wwm-ext-large', **kwargs)
         else:
-            raise RuntimeError("Unknown task %s, task should be sequence_classification" % task)
+            raise RuntimeError("Unknown task {}, task should be one in {}".format(
+                task, self._tasks_supported))

         self.task = task
-        self.label_map = label_map

         if load_checkpoint is not None and os.path.isfile(load_checkpoint):
             state_dict = paddle.load(load_checkpoint)
             self.set_state_dict(state_dict)
             logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))

-    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None):
+    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None):
         result = self.model(input_ids, token_type_ids, position_ids, attention_mask)
-        if self.task is not None:
+        if self.task == 'seq-cls':
             logits = result
             probs = F.softmax(logits, axis=1)
             if labels is not None:
                 loss = self.criterion(logits, labels)
                 correct = self.metric.compute(probs, labels)
                 acc = self.metric.update(correct)
-                return probs, loss, acc
+                return probs, loss, {'acc': acc}
             return probs
+        elif self.task == 'token-cls':
+            logits = result
+            token_level_probs = F.softmax(logits, axis=-1)
+            preds = token_level_probs.argmax(axis=-1)
+            if labels is not None:
+                loss = self.criterion(logits, labels.unsqueeze(-1))
+                num_infer_chunks, num_label_chunks, num_correct_chunks = \
+                    self.metric.compute(None, seq_lengths, preds, labels)
+                self.metric.update(
+                    num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy())
+                _, _, f1_score = map(float, self.metric.accumulate())
+                return token_level_probs, loss, {'f1_score': f1_score}
+            return token_level_probs
         else:
             sequence_output,
pooled_output = result return sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'roberta-wwm-ext-large', 'vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/roberta_large/vocab.txt" - download(url, os.path.join(DATA_HOME, 'roberta-wwm-ext-large')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(str)`): The processed data whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return RobertaTokenizer.from_pretrained( + pretrained_model_name_or_path='roberta-wwm-ext-large', *args, **kwargs) diff --git a/modules/text/language_model/roberta-wwm-ext/README.md b/modules/text/language_model/roberta-wwm-ext/README.md index 8f4eeb80..9ee71b85 100644 --- a/modules/text/language_model/roberta-wwm-ext/README.md +++ b/modules/text/language_model/roberta-wwm-ext/README.md @@ -1,5 +1,5 @@ ```shell -$ hub install roberta-wwm-ext==2.0.0 +$ hub install roberta-wwm-ext==2.0.1 ```
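With `task=None` the module keeps the bare `RobertaModel`, so `forward` returns the `(sequence_output, pooled_output)` pair and `get_embedding` (now provided by `TransformerModule`) batches it for callers. A sketch of local use, following the `get_embedding` signature documented in the README above, with sample texts reused from its serving example:

```python
import paddlehub as hub

# task=None loads the backbone only, which is what get_embedding requires.
model = hub.Module(name='roberta-wwm-ext-large', version='2.0.1', task=None)

results = model.get_embedding(
    data=[['今天是个好日子'], ['天气预报说今天要下雨']],
    max_seq_len=128,
    batch_size=1,
    use_gpu=False,
)

# Each result pairs token-level vectors with the pooled [CLS] vector.
sequence_output, pooled_output = results[0]
```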


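For completeness, a hedged end-to-end fine-tuning sketch for the new task, combining the pieces this patch touches (the module, the static `get_tokenizer`, and the `MSRA_NER` dataset updated below). The optimizer and trainer settings are illustrative assumptions, not values taken from the patch:

```python
import paddle
import paddlehub as hub
from paddlehub.datasets import MSRA_NER

# Label list as hard-coded in paddlehub/datasets/msra_ner.py below.
label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
label_map = {idx: label for idx, label in enumerate(label_list)}

model = hub.Module(name='roberta-wwm-ext', version='2.0.1',
                   task='token-cls', label_map=label_map)
tokenizer = model.get_tokenizer()

train_dataset = MSRA_NER(tokenizer=tokenizer, max_seq_len=128, mode='train')
test_dataset = MSRA_NER(tokenizer=tokenizer, max_seq_len=128, mode='test')

# Assumed hyperparameters, for illustration only.
optimizer = paddle.optimizer.AdamW(learning_rate=5e-5, parameters=model.parameters())
trainer = hub.Trainer(model, optimizer, checkpoint_dir='token_cls_ckpt')
trainer.train(train_dataset, epochs=3, batch_size=32, eval_dataset=test_dataset)
```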
@@ -13,29 +13,35 @@ $ hub install roberta-wwm-ext==2.0.0 def __init__( task=None, load_checkpoint=None, - label_map=None) + label_map=None, + num_classes=2, + **kwargs, +) ``` 创建Module对象(动态图组网版本)。 **参数** -* `task`: 任务名称,可为`sequence_classification`。 +* `task`: 任务名称,可为`seq-cls`(文本分类任务,原来的`sequence_classification`在未来会被弃用)或`token-cls`(序列标注任务)。 * `load_checkpoint`:使用PaddleHub Fine-tune api训练保存的模型参数文件路径。 * `label_map`:预测时的类别映射表。 +* `num_classes`:分类任务的类别数,如果指定了`label_map`,此参数可不传,默认2分类。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 ```python def predict( data, max_seq_len=128, batch_size=1, - use_gpu=False) + use_gpu=False +) ``` **参数** * `data`: 待预测数据,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例, - 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 + 每个样例可以包含text\_a与text\_b。每个样例文本数量(1个或者2个)需和训练时保持一致。 * `max_seq_len`:模型处理文本的最大长度 * `batch_size`:模型批处理大小 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 @@ -44,7 +50,9 @@ def predict( ```python def get_embedding( - texts, + data, + max_seq_len=128, + batch_size=1, use_gpu=False ) ``` @@ -53,7 +61,9 @@ def get_embedding( **参数** -* `texts`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `data`:输入文本列表,格式为\[\[sample\_a\_text\_a, sample\_a\_text\_b\], \[sample\_b\_text\_a, sample\_b\_text\_b\],…,\],其中每个元素都是一个样例,每个样例可以包含text\_a与text\_b。 +* `max_seq_len`:模型处理文本的最大长度。 +* `batch_size`:模型批处理大小。 * `use_gpu`:是否使用gpu,默认为False。对于GPU用户,建议开启use_gpu。 **返回** @@ -67,16 +77,16 @@ def get_embedding( import paddlehub as hub data = [ - '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', - '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', - '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', + ['这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'], + ['怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片'], + ['作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。'], ] label_map = {0: 'negative', 1: 'positive'} model = hub.Module( name='roberta-wwm-ext', - version='2.0.0', - task='sequence_classification', + version='2.0.1', + task='seq-cls', load_checkpoint='/path/to/parameters', label_map=label_map) results = model.predict(data, max_seq_len=50, batch_size=1, use_gpu=False) @@ -84,7 +94,9 @@ for idx, text in enumerate(data): print('Data: {} \t Lable: {}'.format(text, results[idx])) ``` -参考PaddleHub 文本分类示例。https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classifcation +详情可参考PaddleHub示例: +- [文本分类](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/text_classification) +- [序列标注](https://github.com/PaddlePaddle/PaddleHub/tree/release/v2.0.0-beta/demo/sequence_labeling) ## 服务部署 @@ -110,12 +122,12 @@ $ hub serving start -m roberta-wwm-ext import requests import json -# 指定用于预测的文本并生成字典{"text": [text_1, text_2, ... ]} -text = [["今天是个好日子", "天气预报说今天要下雨"], ["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"]] -# 以key的方式指定text传入预测方法的时的参数,此例中为"texts" -# 对应本地部署,则为module.get_embedding(texts=text) -data = {"texts": text} -# 发送post请求,content-type类型应指定json方式 +# 指定用于获取embedding的文本[[text_1], [text_2], ... 
]} +text = [["今天是个好日子"], ["天气预报说今天要下雨"]] +# 以key的方式指定text传入预测方法的时的参数,此例中为"data" +# 对应本地部署,则为module.get_embedding(data=text) +data = {"data": text} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip url = "http://10.12.121.132:8866/predict/roberta-wwm-ext" # 指定post请求的headers为application/json方式 headers = {"Content-Type": "application/json"} @@ -126,7 +138,7 @@ print(r.json()) ## 查看代码 -https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/pretrain_langauge_models/BERT +https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/paddlenlp/transformers/roberta ## 依赖 @@ -144,3 +156,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。 + +* 2.0.1 + + 任务名称调整,增加序列标注任务`token-cls` diff --git a/modules/text/language_model/roberta-wwm-ext/module.py b/modules/text/language_model/roberta-wwm-ext/module.py index a4df8146..8fa2bbe7 100644 --- a/modules/text/language_model/roberta-wwm-ext/module.py +++ b/modules/text/language_model/roberta-wwm-ext/module.py @@ -11,29 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict import os +import math -from paddle.dataset.common import DATA_HOME import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlehub import BertTokenizer -from paddlehub.module.modeling_roberta import RobertaModel, RobertaForSequenceClassification -from paddlehub.module.module import moduleinfo, serving +from paddlenlp.transformers.roberta.modeling import RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel +from paddlenlp.transformers.roberta.tokenizer import RobertaTokenizer +from paddlenlp.metrics import ChunkEvaluator +from paddlehub.module.module import moduleinfo +from paddlehub.module.nlp_module import TransformerModule from paddlehub.utils.log import logger -from paddlehub.utils.utils import download @moduleinfo( name="roberta-wwm-ext", - version="2.0.0", + version="2.0.1", summary= "chinese-roberta-wwm-ext, 12-layer, 768-hidden, 12-heads, 110M parameters. 
The module is executed as paddle.dygraph.", author="ymcui", author_email="ymcui@ir.hit.edu.cn", type="nlp/semantic_model", + meta=TransformerModule, ) class Roberta(nn.Layer): """ @@ -42,181 +44,88 @@ class Roberta(nn.Layer): def __init__( self, - task=None, - load_checkpoint=None, - label_map=None, + task: str = None, + load_checkpoint: str = None, + label_map: Dict = None, + num_classes: int = 2, + **kwargs, ): super(Roberta, self).__init__() - # TODO(zhangxuefei): add token_classification task + if label_map: + self.label_map = label_map + self.num_classes = len(label_map) + else: + self.num_classes = num_classes + if task == 'sequence_classification': + task = 'seq-cls' + logger.warning( + "current task name 'sequence_classification' was renamed to 'seq-cls', " + "'sequence_classification' has been deprecated and will be removed in the future.", + ) + if task == 'seq-cls': self.model = RobertaForSequenceClassification.from_pretrained( - pretrained_model_name_or_path='roberta-wwm-ext') + pretrained_model_name_or_path='roberta-wwm-ext', + num_classes=self.num_classes, + **kwargs + ) self.criterion = paddle.nn.loss.CrossEntropyLoss() - self.metric = paddle.metric.Accuracy(name='acc_accumulation') + self.metric = paddle.metric.Accuracy() + elif task == 'token-cls': + self.model = RobertaForTokenClassification.from_pretrained( + pretrained_model_name_or_path='roberta-wwm-ext', + num_classes=self.num_classes, + **kwargs + ) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = ChunkEvaluator( + label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] + ) elif task is None: - self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-wwm-ext') + self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-wwm-ext', **kwargs) else: - raise RuntimeError("Unknown task %s, task should be sequence_classification" % task) + raise RuntimeError("Unknown task {}, task should be one in {}".format( + task, self._tasks_supported)) self.task = task - self.label_map = label_map if load_checkpoint is not None and os.path.isfile(load_checkpoint): state_dict = paddle.load(load_checkpoint) self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): result = self.model(input_ids, token_type_ids, position_ids, attention_mask) - if self.task is not None: + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) - return probs, loss, acc + return probs, loss, {'acc': acc} return probs + elif self.task == 'token-cls': + logits = result + token_level_probs = F.softmax(logits, axis=-1) + preds = token_level_probs.argmax(axis=-1) + if labels is not None: + loss = self.criterion(logits, labels.unsqueeze(-1)) + num_infer_chunks, num_label_chunks, num_correct_chunks = \ + self.metric.compute(None, seq_lengths, preds, labels) + self.metric.update( + num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + _, _, f1_score = map(float, self.metric.accumulate()) + return token_level_probs, loss, {'f1_score': f1_score} + return token_level_probs else: sequence_output, pooled_output = result return 
sequence_output, pooled_output - def get_vocab_path(self): - """ - Gets the path of the module vocabulary path. - """ - save_path = os.path.join(DATA_HOME, 'roberta-wwm-ext', 'vocab.txt') - if not os.path.exists(save_path) or not os.path.isfile(save_path): - url = "https://paddlenlp.bj.bcebos.com/models/transformers/roberta_base/vocab.txt" - download(url, os.path.join(DATA_HOME, 'roberta-wwm-ext')) - return save_path - - def get_tokenizer(self, tokenize_chinese_chars=True): + @staticmethod + def get_tokenizer(*args, **kwargs): """ Gets the tokenizer that is customized for this module. - Args: - tokenize_chinese_chars (:obj: bool , defaults to :obj: True): - Whether to tokenize chinese characters or not. - Returns: - tokenizer (:obj:BertTokenizer) : The tokenizer which was customized for this module. - """ - return BertTokenizer(tokenize_chinese_chars=tokenize_chinese_chars, vocab_file=self.get_vocab_path()) - - def training_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for training, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as loss and metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} - - def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): - """ - One step for validation, which should be called as forward computation. - Args: - batch(:obj:List[paddle.Tensor]): The one batch data, which contains the model needed, - such as input_ids, sent_ids, pos_ids, input_mask and labels. - batch_idx(int): The index of batch. - Returns: - results(:obj: Dict) : The model outputs, such as metrics. - """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} - - def predict(self, data, max_seq_len=128, batch_size=1, use_gpu=False): """ - Predicts the data labels. - - Args: - data (obj:`List(Union(str))`): The processed data (the one sequence or sequence pair) whose each element is the raw text. - max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): - If set to a number, will limit the total sequence returned so that it has a maximum length. - batch_size(obj:`int`, defaults to 1): The number of batch. - use_gpu(obj:`bool`, defaults to `False`): Whether to use gpu to run or not. - - Returns: - results(obj:`list`): All the predictions labels. - """ - # TODO(zhangxuefei): add task token_classification task predict. - if self.task not in ['sequence_classification']: - raise RuntimeError("The predict method is for sequence_classification task, but got task %s." % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - tokenizer = self.get_tokenizer() - - examples = [] - for text in data: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, max_seq_len=max_seq_len) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], max_seq_len=max_seq_len) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' 
% len(text)) - examples.append((encoded_inputs['input_ids'], encoded_inputs['segment_ids'])) - - def _batchify_fn(batch): - input_ids = [entry[0] for entry in batch] - segment_ids = [entry[1] for entry in batch] - return input_ids, segment_ids - - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - - results = [] - self.eval() - for batch in batches: - input_ids, segment_ids = _batchify_fn(batch) - input_ids = paddle.to_tensor(input_ids) - segment_ids = paddle.to_tensor(segment_ids) - - # TODO(zhangxuefei): add task token_classification postprocess after prediction. - if self.task == 'sequence_classification': - probs = self(input_ids, segment_ids) - idx = paddle.argmax(probs, axis=1).numpy() - idx = idx.tolist() - labels = [self.label_map[i] for i in idx] - results.extend(labels) - - return results - - @serving - def get_embedding(self, texts, use_gpu=False): - if self.task is not None: - raise RuntimeError("The get_embedding method is only valid when task is None, but got task %s" % self.task) - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - tokenizer = self.get_tokenizer() - results = [] - for text in texts: - if len(text) == 1: - encoded_inputs = tokenizer.encode(text[0], text_pair=None, pad_to_max_seq_len=False) - elif len(text) == 2: - encoded_inputs = tokenizer.encode(text[0], text_pair=text[1], pad_to_max_seq_len=False) - else: - raise RuntimeError( - 'The input text must have one or two sequence, but got %d. Please check your inputs.' % len(text)) - - input_ids = paddle.to_tensor(encoded_inputs['input_ids']).unsqueeze(0) - segment_ids = paddle.to_tensor(encoded_inputs['segment_ids']).unsqueeze(0) - sequence_output, pooled_output = self(input_ids, segment_ids) - - sequence_output = sequence_output.squeeze(0) - pooled_output = pooled_output.squeeze(0) - results.append((sequence_output.numpy().tolist(), pooled_output.numpy().tolist())) - return results + return RobertaTokenizer.from_pretrained( + pretrained_model_name_or_path='roberta-wwm-ext', *args, **kwargs) diff --git a/paddlehub/datasets/base_nlp_dataset.py b/paddlehub/datasets/base_nlp_dataset.py index acca7b8c..1c9ae13a 100644 --- a/paddlehub/datasets/base_nlp_dataset.py +++ b/paddlehub/datasets/base_nlp_dataset.py @@ -246,15 +246,9 @@ class TextClassificationDataset(BaseNLPDataset, paddle.io.Dataset): def __getitem__(self, idx): record = self.records[idx] if 'label' in record.keys(): - if isinstance(self.tokenizer, BertTokenizer): - return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['label']) - elif isinstance(self.tokenizer, CustomTokenizer): - return np.array(record['text']), np.array(record['seq_len']), np.array(record['label']) + return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['label'], dtype=np.int64) else: - if isinstance(self.tokenizer, BertTokenizer): - return np.array(record['input_ids']), np.array(record['segment_ids']) - elif isinstance(self.tokenizer, CustomTokenizer): - return np.array(record['text']), np.array(record['seq_len']) + return np.array(record['input_ids']), np.array(record['segment_ids']) def __len__(self): return len(self.records) @@ -269,8 +263,9 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset): data_file: str = None, 
label_file: str = None, label_list: list = None, - split_char="\002", - no_entity_label="O", + split_char: str ="\002", + no_entity_label: str = "O", + ignore_label: int = -100, is_file_with_header: bool = False): super(SeqLabelingDataset, self).__init__( base_path=base_path, @@ -283,6 +278,7 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset): self.no_entity_label = no_entity_label self.split_char = split_char + self.ignore_label = ignore_label self.examples = self._read_file(self.data_file, is_file_with_header) self.records = self._convert_examples_to_records(self.examples) @@ -327,8 +323,7 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset): continue if labels: record["label"] = [] - tokens_with_specical_token = self.tokenizer.decode( - record, only_convert_to_tokens=True) + tokens_with_specical_token = self.tokenizer.convert_ids_to_tokens(record['input_ids']) tokens_index = 0 for token in tokens_with_specical_token: if tokens_index < len( @@ -336,6 +331,8 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset): record["label"].append( self.label_list.index(labels[tokens_index])) tokens_index += 1 + elif token in [self.tokenizer.pad_token]: + record["label"].append(self.ignore_label) # label of special token else: record["label"].append( self.label_list.index(self.no_entity_label)) @@ -351,7 +348,7 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset): ret_tokens = [] ret_labels = [] for token, label in zip(tokens, labels): - sub_token = self.tokenizer.tokenize(token) + sub_token = self.tokenizer(token) if len(sub_token) == 0: continue ret_tokens.extend(sub_token) @@ -370,7 +367,7 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset): else: ret_tokens = [] for token in tokens: - sub_token = self.tokenizer.tokenize(token) + sub_token = self.tokenizer(token) if len(sub_token) == 0: continue ret_tokens.extend(sub_token) @@ -381,15 +378,9 @@ class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset): def __getitem__(self, idx): record = self.records[idx] if 'label' in record.keys(): - if isinstance(self.tokenizer, BertTokenizer): - return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['label']) - else: # TODO(chenxiaojie): add CustomTokenizer supported - raise NotImplementedError + return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['seq_len']), np.array(record['label'], dtype=np.int64) else: - if isinstance(self.tokenizer, BertTokenizer): - return np.array(record['input_ids']), np.array(record['segment_ids']) - else: # TODO(chenxiaojie): add CustomTokenizer supported - raise NotImplementedError + return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['seq_len']) def __len__(self): return len(self.records) diff --git a/paddlehub/datasets/msra_ner.py b/paddlehub/datasets/msra_ner.py index c9f4b2e3..8440e7c0 100644 --- a/paddlehub/datasets/msra_ner.py +++ b/paddlehub/datasets/msra_ner.py @@ -31,8 +31,16 @@ class MSRA_NER(SeqLabelingDataset): for research purposes. 
For more information please refer to https://www.microsoft.com/en-us/download/details.aspx?id=52531 """ - def __init__(self, tokenizer: Union[BertTokenizer, CustomTokenizer], max_seq_len: int = 128, mode: str = 'train'): + + def __init__( + self, + tokenizer: Union[BertTokenizer, CustomTokenizer], + max_seq_len: int = 128, + mode: str = 'train', + ): base_path = os.path.join(DATA_HOME, "msra_ner") + label_list = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"] + if mode == 'train': data_file = 'train.tsv' elif mode == 'test': @@ -46,6 +54,6 @@ class MSRA_NER(SeqLabelingDataset): mode=mode, data_file=data_file, label_file=None, - label_list=["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"], + label_list=label_list, is_file_with_header=True, ) diff --git a/paddlehub/module/modeling_bert.py b/paddlehub/module/modeling_bert.py deleted file mode 100644 index 5ab602b8..00000000 --- a/paddlehub/module/modeling_bert.py +++ /dev/null @@ -1,289 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# FIXME(zhangxuefei): remove this file after paddlenlp is released. - -import paddle -import paddle.nn as nn - -from paddlehub.module.nlp_module import PretrainedModel, register_base_model - - -class BertEmbeddings(nn.Layer): - """ - Include embeddings from word, position and token_type embeddings - """ - - def __init__(self, - vocab_size, - hidden_size=768, - hidden_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16): - super(BertEmbeddings, self).__init__() - self.word_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=0) - self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) - self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size) - self.layer_norm = nn.LayerNorm(hidden_size) - self.dropout = nn.Dropout(hidden_dropout_prob) - - def forward(self, input_ids, token_type_ids=None, position_ids=None): - if position_ids is None: - # maybe need use shape op to unify static graph and dynamic graph - seq_length = input_ids.shape[1] - position_ids = paddle.arange(0, seq_length, dtype="int64") - if token_type_ids is None: - token_type_ids = paddle.zeros_like(input_ids, dtype="int64") - - input_embedings = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = input_embedings + position_embeddings + token_type_embeddings - embeddings = self.layer_norm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class BertPooler(nn.Layer): - """ - """ - - def __init__(self, hidden_size): - super(BertPooler, self).__init__() - self.dense = nn.Linear(hidden_size, hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. 
- first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class BertPretrainedModel(PretrainedModel): - """ - An abstract class for pretrained BERT models. It provides BERT related - `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, - `pretrained_init_configuration`, `base_model_prefix` for downloading and - loading pretrained models. See `PretrainedModel` for more details. - """ - - model_config_file = "model_config.json" - pretrained_init_configuration = { - "bert-base-uncased": { - "vocab_size": 30522, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-large-uncased": { - "vocab_size": 30522, - "hidden_size": 1024, - "num_hidden_layers": 24, - "num_attention_heads": 16, - "intermediate_size": 4096, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-base-multilingual-uncased": { - "vocab_size": 105879, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-base-cased": { - "vocab_size": 30522, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-base-chinese": { - "vocab_size": 21128, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-base-multilingual-cased": { - "vocab_size": 119547, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-large-cased": { - "vocab_size": 28996, - "hidden_size": 1024, - "num_hidden_layers": 24, - "num_attention_heads": 16, - "intermediate_size": 4096, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - } - resource_files_names = {"model_state": "model_state.pdparams"} - pretrained_resource_files_map = { - "model_state": { - "bert-base-uncased": "https://paddlenlp.bj.bcebos.com/models/transformers/bert-base-uncased.pdparams", - "bert-large-uncased": "https://paddlenlp.bj.bcebos.com/models/transformers/bert-large-uncased.pdparams", - "bert-base-multilingual-uncased": - 
"http://paddlenlp.bj.bcebos.com/models/transformers/bert-base-multilingual-uncased.pdparams", - "bert-base-cased": "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-base-cased.pdparams", - "bert-base-chinese": "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-base-chinese.pdparams", - "bert-base-multilingual-cased": - "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-base-multilingual-cased.pdparamss", - "bert-large-cased": "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-large-cased.pdparams" - } - } - base_model_prefix = "bert" - - def init_weights(self, layer): - """ Initialization hook """ - if isinstance(layer, (nn.Linear, nn.Embedding)): - # only support dygraph, use truncated_normal and make it inplace - # and configurable later - layer.weight.set_value( - paddle.tensor.normal( - mean=0.0, - std=self.initializer_range - if hasattr(self, "initializer_range") else self.bert.config["initializer_range"], - shape=layer.weight.shape)) - elif isinstance(layer, nn.LayerNorm): - layer._epsilon = 1e-12 - - -@register_base_model -class BertModel(BertPretrainedModel): - """ - """ - - def __init__(self, - vocab_size, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - initializer_range=0.02, - pad_token_id=0): - super(BertModel, self).__init__() - self.pad_token_id = pad_token_id - self.initializer_range = initializer_range - self.embeddings = BertEmbeddings(vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, - type_vocab_size) - encoder_layer = nn.TransformerEncoderLayer( - hidden_size, - num_attention_heads, - intermediate_size, - dropout=hidden_dropout_prob, - activation=hidden_act, - attn_dropout=attention_probs_dropout_prob, - act_dropout=0) - self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers) - self.pooler = BertPooler(hidden_size) - self.apply(self.init_weights) - - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): - if attention_mask is None: - attention_mask = paddle.unsqueeze( - (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e9, axis=[1, 2]) - embedding_output = self.embeddings( - input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids) - encoder_outputs = self.encoder(embedding_output, attention_mask) - sequence_output = encoder_outputs - pooled_output = self.pooler(sequence_output) - return sequence_output, pooled_output - - -class BertForSequenceClassification(BertPretrainedModel): - """ - Model for sentence (pair) classification task with BERT. - Args: - bert (BertModel): An instance of BertModel. - num_classes (int, optional): The number of classes. Default 2 - dropout (float, optional): The dropout probability for output of BERT. - If None, use the same value as `hidden_dropout_prob` of `BertModel` - instance `bert`. 
Default None - """ - - def __init__(self, bert, num_classes=2, dropout=None): - super(BertForSequenceClassification, self).__init__() - self.num_classes = num_classes - self.bert = bert # allow bert to be config - self.dropout = nn.Dropout(dropout if dropout is not None else self.bert.config["hidden_dropout_prob"]) - self.classifier = nn.Linear(self.bert.config["hidden_size"], num_classes) - self.apply(self.init_weights) - - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): - _, pooled_output = self.bert( - input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask) - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - return logits diff --git a/paddlehub/module/modeling_ernie.py b/paddlehub/module/modeling_ernie.py deleted file mode 100644 index ef43785b..00000000 --- a/paddlehub/module/modeling_ernie.py +++ /dev/null @@ -1,243 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# FIXME(zhangxuefei): remove this file after paddlenlp is released. - -import paddle -import paddle.nn as nn - -from paddlehub.module.nlp_module import PretrainedModel, register_base_model - - -class ErnieEmbeddings(nn.Layer): - """ - Include embeddings from word, position and token_type embeddings - """ - - def __init__(self, - vocab_size, - hidden_size=768, - hidden_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - pad_token_id=0): - super(ErnieEmbeddings, self).__init__() - self.word_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=pad_token_id) - self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) - self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size) - self.layer_norm = nn.LayerNorm(hidden_size) - self.dropout = nn.Dropout(hidden_dropout_prob) - - def forward(self, input_ids, token_type_ids=None, position_ids=None): - if position_ids is None: - # maybe need use shape op to unify static graph and dynamic graph - seq_length = input_ids.shape[1] - position_ids = paddle.arange(0, seq_length, dtype="int64") - if token_type_ids is None: - token_type_ids = paddle.zeros_like(input_ids, dtype="int64") - - input_embedings = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = input_embedings + position_embeddings + token_type_embeddings - embeddings = self.layer_norm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class ErniePooler(nn.Layer): - """ - """ - - def __init__(self, hidden_size): - super(ErniePooler, self).__init__() - self.dense = nn.Linear(hidden_size, hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. 
- first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class ErniePretrainedModel(PretrainedModel): - """ - An abstract class for pretrained ERNIE models. It provides ERNIE related - `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, - `pretrained_init_configuration`, `base_model_prefix` for downloading and - loading pretrained models. See `PretrainedModel` for more details. - """ - - model_config_file = "model_config.json" - pretrained_init_configuration = { - "ernie": { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "relu", - "hidden_dropout_prob": 0.1, - "hidden_size": 768, - "initializer_range": 0.02, - "max_position_embeddings": 513, - "num_attention_heads": 12, - "num_hidden_layers": 12, - "type_vocab_size": 2, - "vocab_size": 18000, - "pad_token_id": 0, - }, - "ernie_tiny": { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "relu", - "hidden_dropout_prob": 0.1, - "hidden_size": 1024, - "initializer_range": 0.02, - "intermediate_size": 4096, - "max_position_embeddings": 600, - "num_attention_heads": 16, - "num_hidden_layers": 3, - "type_vocab_size": 2, - "vocab_size": 50006, - "pad_token_id": 0, - }, - "ernie_v2_eng_base": { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 768, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "num_attention_heads": 12, - "num_hidden_layers": 12, - "type_vocab_size": 4, - "vocab_size": 30522, - "pad_token_id": 0, - }, - "ernie_v2_eng_large": { - "attention_probs_dropout_prob": 0.1, - "intermediate_size": 4096, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 1024, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "num_attention_heads": 16, - "num_hidden_layers": 24, - "type_vocab_size": 4, - "vocab_size": 30522, - "pad_token_id": 0, - }, - } - resource_files_names = {"model_state": "model_state.pdparams"} - pretrained_resource_files_map = { - "model_state": { - "ernie": - "https://paddlenlp.bj.bcebos.com/models/transformers/ernie/ernie_v1_chn_base.pdparams", - "ernie_tiny": - "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_tiny/ernie_tiny.pdparams", - "ernie_v2_eng_base": - "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_v2_base/ernie_v2_eng_base.pdparams", - "ernie_v2_eng_large": - "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_v2_large/ernie_v2_eng_large.pdparams", - } - } - base_model_prefix = "ernie" - - def init_weights(self, layer): - """ Initialization hook """ - if isinstance(layer, (nn.Linear, nn.Embedding)): - # only support dygraph, use truncated_normal and make it inplace - # and configurable later - layer.weight.set_value( - paddle.tensor.normal( - mean=0.0, - std=self.initializer_range - if hasattr(self, "initializer_range") else self.ernie.config["initializer_range"], - shape=layer.weight.shape)) - - -@register_base_model -class ErnieModel(ErniePretrainedModel): - """ - """ - - def __init__(self, - vocab_size, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - initializer_range=0.02, - pad_token_id=0): - super(ErnieModel, self).__init__() - self.pad_token_id = pad_token_id - self.initializer_range = initializer_range - self.embeddings = 
ErnieEmbeddings(vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, - type_vocab_size, pad_token_id) - encoder_layer = nn.TransformerEncoderLayer( - hidden_size, - num_attention_heads, - intermediate_size, - dropout=hidden_dropout_prob, - activation=hidden_act, - attn_dropout=attention_probs_dropout_prob, - act_dropout=0) - self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers) - self.pooler = ErniePooler(hidden_size) - self.apply(self.init_weights) - - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): - if attention_mask is None: - attention_mask = paddle.unsqueeze( - (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e9, axis=[1, 2]) - embedding_output = self.embeddings( - input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids) - encoder_outputs = self.encoder(embedding_output, attention_mask) - sequence_output = encoder_outputs - pooled_output = self.pooler(sequence_output) - return sequence_output, pooled_output - - -class ErnieForSequenceClassification(ErniePretrainedModel): - """ - Model for sentence (pair) classification task with ERNIE. - Args: - ernie (ErnieModel): An instance of `ErnieModel`. - num_classes (int, optional): The number of classes. Default 2 - dropout (float, optional): The dropout probability for output of ERNIE. - If None, use the same value as `hidden_dropout_prob` of `ErnieModel` - instance `Ernie`. Default None - """ - - def __init__(self, ernie, num_classes=2, dropout=None): - super(ErnieForSequenceClassification, self).__init__() - self.num_classes = num_classes - self.ernie = ernie # allow ernie to be config - self.dropout = nn.Dropout(dropout if dropout is not None else self.ernie.config["hidden_dropout_prob"]) - self.classifier = nn.Linear(self.ernie.config["hidden_size"], num_classes) - self.apply(self.init_weights) - - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): - _, pooled_output = self.ernie( - input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask) - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - return logits diff --git a/paddlehub/module/modeling_roberta.py b/paddlehub/module/modeling_roberta.py deleted file mode 100644 index 62d75539..00000000 --- a/paddlehub/module/modeling_roberta.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# FIXME(zhangxuefei): remove this file after paddlenlp is released. 
- -import paddle -import paddle.nn as nn - -from paddlehub.module.nlp_module import PretrainedModel, register_base_model - - -class RobertaEmbeddings(nn.Layer): - """ - Include embeddings from word, position and token_type embeddings - """ - - def __init__(self, - vocab_size, - hidden_size=768, - hidden_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - pad_token_id=0): - super(RobertaEmbeddings, self).__init__() - self.word_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=pad_token_id) - self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) - self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size) - self.layer_norm = nn.LayerNorm(hidden_size) - self.dropout = nn.Dropout(hidden_dropout_prob) - - def forward(self, input_ids, token_type_ids=None, position_ids=None): - if position_ids is None: - # maybe need use shape op to unify static graph and dynamic graph - seq_length = input_ids.shape[1] - position_ids = paddle.arange(0, seq_length, dtype="int64") - if token_type_ids is None: - token_type_ids = paddle.zeros_like(input_ids, dtype="int64") - - input_embedings = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = input_embedings + position_embeddings + token_type_embeddings - embeddings = self.layer_norm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class RobertaPooler(nn.Layer): - """ - """ - - def __init__(self, hidden_size): - super(RobertaPooler, self).__init__() - self.dense = nn.Linear(hidden_size, hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. - first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class RobertaPretrainedModel(PretrainedModel): - """ - An abstract class for pretrained RoBERTa models. It provides RoBERTa related - `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, - `pretrained_init_configuration`, `base_model_prefix` for downloading and - loading pretrained models. See `PretrainedModel` for more details. 
- """ - - model_config_file = "model_config.json" - pretrained_init_configuration = { - "roberta-wwm-ext": { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 768, - "initializer_range": 0.02, - "intermediate_size": 3072, - "max_position_embeddings": 512, - "num_attention_heads": 12, - "num_hidden_layers": 12, - "type_vocab_size": 2, - "vocab_size": 21128, - "pad_token_id": 0 - }, - "roberta-wwm-ext-large": { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 1024, - "initializer_range": 0.02, - "intermediate_size": 4096, - "max_position_embeddings": 512, - "num_attention_heads": 16, - "num_hidden_layers": 24, - "type_vocab_size": 2, - "vocab_size": 21128, - "pad_token_id": 0 - } - } - resource_files_names = {"model_state": "model_state.pdparams"} - pretrained_resource_files_map = { - "model_state": { - "roberta-wwm-ext": - "https://paddlenlp.bj.bcebos.com/models/transformers/roberta_base/roberta_chn_base.pdparams", - "roberta-wwm-ext-large": - "https://paddlenlp.bj.bcebos.com/models/transformers/roberta_large/roberta_chn_large.pdparams", - } - } - base_model_prefix = "roberta" - - def init_weights(self, layer): - """ Initialization hook """ - if isinstance(layer, (nn.Linear, nn.Embedding)): - # only support dygraph, use truncated_normal and make it inplace - # and configurable later - layer.weight.set_value( - paddle.tensor.normal( - mean=0.0, - std=self.initializer_range - if hasattr(self, "initializer_range") else self.roberta.config["initializer_range"], - shape=layer.weight.shape)) - elif isinstance(layer, nn.LayerNorm): - layer._epsilon = 1e-12 - - -@register_base_model -class RobertaModel(RobertaPretrainedModel): - """ - """ - - def __init__(self, - vocab_size, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - initializer_range=0.02, - pad_token_id=0): - super(RobertaModel, self).__init__() - self.pad_token_id = pad_token_id - self.initializer_range = initializer_range - self.embeddings = RobertaEmbeddings(vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, - type_vocab_size, pad_token_id) - encoder_layer = nn.TransformerEncoderLayer( - hidden_size, - num_attention_heads, - intermediate_size, - dropout=hidden_dropout_prob, - activation=hidden_act, - attn_dropout=attention_probs_dropout_prob, - act_dropout=0) - self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers) - self.pooler = RobertaPooler(hidden_size) - self.apply(self.init_weights) - - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): - if attention_mask is None: - attention_mask = paddle.unsqueeze( - (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e9, axis=[1, 2]) - embedding_output = self.embeddings( - input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids) - encoder_outputs = self.encoder(embedding_output, attention_mask) - sequence_output = encoder_outputs - pooled_output = self.pooler(sequence_output) - return sequence_output, pooled_output - - -class RobertaForSequenceClassification(RobertaPretrainedModel): - """ - Model for sentence (pair) classification task with RoBERTa. - Args: - roberta (RobertaModel): An instance of `RobertaModel`. - num_classes (int, optional): The number of classes. 
Default 2 - dropout (float, optional): The dropout probability for output of RoBERTa. - If None, use the same value as `hidden_dropout_prob` of `RobertaModel` - instance `Roberta`. Default None - """ - - def __init__(self, roberta, num_classes=2, dropout=None): - super(RobertaForSequenceClassification, self).__init__() - self.num_classes = num_classes - self.roberta = roberta # allow roberta to be config - self.dropout = nn.Dropout(dropout if dropout is not None else self.roberta.config["hidden_dropout_prob"]) - self.classifier = nn.Linear(self.roberta.config["hidden_size"], num_classes) - self.apply(self.init_weights) - - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): - _, pooled_output = self.roberta( - input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask) - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - return logits diff --git a/paddlehub/module/nlp_module.py b/paddlehub/module/nlp_module.py index 37c86f3a..ddfd546c 100644 --- a/paddlehub/module/nlp_module.py +++ b/paddlehub/module/nlp_module.py @@ -453,8 +453,11 @@ class TransformerModule(RunModule, TextServing): Returns: results(:obj: Dict) : The model outputs, such as loss and metrics. """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'loss': avg_loss, 'metrics': {'acc': acc}} + if self.task == 'seq-cls': + predictions, avg_loss, metric = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) + elif self.task == 'token-cls': + predictions, avg_loss, metric = self(input_ids=batch[0], token_type_ids=batch[1], seq_lengths=batch[2], labels=batch[3]) + return {'loss': avg_loss, 'metrics': metric} def validation_step(self, batch: List[paddle.Tensor], batch_idx: int): """ @@ -466,8 +469,11 @@ class TransformerModule(RunModule, TextServing): Returns: results(:obj: Dict) : The model outputs, such as metrics. """ - predictions, avg_loss, acc = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) - return {'metrics': {'acc': acc}} + if self.task == 'seq-cls': + predictions, avg_loss, metric = self(input_ids=batch[0], token_type_ids=batch[1], labels=batch[2]) + elif self.task == 'token-cls': + predictions, avg_loss, metric = self(input_ids=batch[0], token_type_ids=batch[1], seq_lengths=batch[2], labels=batch[3]) + return {'metrics': metric} def get_embedding(self, data: List[List[str]], max_seq_len=128, batch_size=1, use_gpu=False): """ diff --git a/requirements.txt b/requirements.txt index fe998426..8102079e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,4 @@ tqdm visualdl >= 2.0.0 # gunicorn not support windows gunicorn >= 19.10.0; sys_platform != "win32" -paddlenlp >= 2.0.0b \ No newline at end of file +paddlenlp >= 2.0.0b2 \ No newline at end of file -- GitLab
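Taken together, the dataset and runner changes define a simple contract: `SeqLabelingDataset.__getitem__` now emits a four-tuple whose third element carries `seq_len` for `ChunkEvaluator`, and `training_step`/`validation_step` dispatch on `self.task`. A small sketch of that contract with toy values (the ids are not real vocabulary entries):

```python
import numpy as np

# Shape of one record after SeqLabelingDataset._convert_examples_to_records.
record = {
    'input_ids': [101, 2769, 102, 0],   # toy ids; the trailing 0 is padding
    'segment_ids': [0, 0, 0, 0],
    'seq_len': 3,
    'label': [6, 0, 6, -100],           # ignore_label=-100 marks the pad token
}

# __getitem__ returns, in order: input_ids, segment_ids, seq_len, label,
# with labels cast to int64 as in the diff above.
batch_item = (
    np.array(record['input_ids']),
    np.array(record['segment_ids']),
    np.array(record['seq_len']),
    np.array(record['label'], dtype=np.int64),
)

# For task='token-cls', training_step then unpacks a collated batch as
#   self(input_ids=batch[0], token_type_ids=batch[1],
#        seq_lengths=batch[2], labels=batch[3])
```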