Unverified commit 221088ce, authored by KP, committed by GitHub

Add UnifiedTransformer.

Parent 53c3e6d2
```shell
$ hub install plato_mini==1.0.0
```
## Overview
In recent years, human-machine dialogue systems have drawn wide attention from academia and industry and have made solid progress. Open-domain dialogue systems aim to build multi-turn systems that converse with people fluently and naturally, handling both everyday chit-chat and specific functional tasks, which gives them practical application value. More concretely, open-domain dialogue can be broken down into forms that support different skills, such as conversational recommendation and knowledge-grounded dialogue, and combining these skills effectively in one system raises many challenges.
[UnifiedTransformer](https://arxiv.org/abs/2006.16779) uses the [Transformer](https://arxiv.org/abs/1706.03762) encoder as its basic building block and adopts a flexible attention mechanism, which makes it well suited to text generation. Special tokens identifying different dialogue skills are added to the model input, so a single model can handle chit-chat, recommendation dialogue, and knowledge-grounded dialogue at the same time.
plato_mini is a 6-layer Transformer with 12 attention heads, a hidden size of 768, and 89M parameters. It was pre-trained on Chinese dialogue data at the billion-sample scale and can be used for dialogue tasks directly after being loaded through PaddleHub.
## API
```python
def predict(data: Union[List[List[str]], str],
max_seq_len: int = 512,
batch_size: int = 1,
use_gpu: bool = False,
**kwargs):
```
A prediction API: given a dialogue context, it returns the model's reply.
**Parameters**
- `data`(Union[List[List[str]], str]): In non-interactive mode the type is List[List[str]]; each sample is a List[str] holding the dialogue history.
- `max_seq_len`(int): Maximum text length of each sample.
- `batch_size`(int): Batch size used for prediction.
- `use_gpu`(bool): Whether to run prediction on the GPU.
- `kwargs`: Extra arguments passed to the model at prediction time, given as keyword arguments (see the decoding example below). For details of the remaining arguments, see [UnifiedTransformer](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/dialogue/unified_transformer).
**Returns**
* `results`(List[str]): Each element is the model's new reply for the corresponding dialogue.
```python
def interactive_mode(max_turn=3)
```
Enters interactive mode. In interactive mode, the `data` argument of `predict` also accepts a string.
**Parameters**
- `max_turn`(int): Number of dialogue turns the model remembers. When `max_turn` is 1, the model only keeps the current utterance and has no access to earlier turns.
**Code examples**
```python
# Non-interactive mode
import paddlehub as hub
model = hub.Module(name='plato_mini')
data = [["你是谁?"], ["你好啊。", "吃饭了吗?",]]
result = model.predict(data)
```
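The extra keyword arguments of `predict` are forwarded to the underlying generation call, so decoding can be tuned per call. A minimal sketch (the parameter values below are illustrative, not recommended defaults):
```python
# Tune decoding through the keyword arguments forwarded by predict
import paddlehub as hub
model = hub.Module(name='plato_mini')
data = [["你好啊。", "吃饭了吗?"]]
# top_k and temperature steer the sampling; with num_return_sequences > 1,
# only the highest-scoring candidate per dialogue is kept in the results.
result = model.predict(data,
                       max_seq_len=512,
                       decode_strategy='sampling',
                       top_k=10,
                       temperature=0.9,
                       num_return_sequences=5)
print(result)
```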
```python
# Interactive mode
import paddlehub as hub
model = hub.Module(name='plato_mini')
with model.interactive_mode(max_turn=3):
while True:
human_utterance = input("[Human]: ").strip()
robot_utterance = model.predict(human_utterance)[0]
print("[Bot]: %s"%robot_utterance)
```
## Serving deployment
PaddleHub Serving can deploy the model as an online service.
### Step 1: Start PaddleHub Serving
Run the start command:
```shell
$ hub serving start -m plato_mini
```
This deploys a dialogue-bot API as a service; the default port is 8866.
**NOTE:** To run prediction on a GPU, set the CUDA_VISIBLE_DEVICES environment variable before starting the service; otherwise no extra setting is needed.
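For example, to serve on the first GPU (adjust the device index to your machine):
```shell
$ export CUDA_VISIBLE_DEVICES=0
$ hub serving start -m plato_mini
```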
### Step 2: Send a prediction request
With the server up, the following few lines of code send a prediction request and retrieve the result:
```python
import requests
import json
texts = [["今天是个好日子"], ["天气预报说今天要下雨"]]
data = {"data": texts}
# Send a POST request. The content type must be JSON, and the IP address in the URL should be changed to that of the serving machine.
url = "http://127.0.0.1:8866/predict/plato_mini"
# Set the POST request headers to application/json
headers = {"Content-Type": "application/json"}
r = requests.post(url=url, headers=headers, data=json.dumps(data))
print(r.json())
```
## Source code
https://github.com/PaddlePaddle/Knover
## Dependencies
paddlepaddle >= 2.0.0
paddlehub >= 2.1.0
## Release history
* 1.0.0
  First release
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from collections import deque
from typing import List, Union
import numpy as np
import paddle
import paddle.nn as nn
from paddlehub.module.module import moduleinfo, serving
from paddlenlp.data import Pad
from paddlenlp.transformers import UnifiedTransformerLMHeadModel, UnifiedTransformerTokenizer
from plato_mini.utils import select_response
@moduleinfo(
name="plato-mini",
version="1.0.0",
summary="",
author="PaddlePaddle",
author_email="",
type="nlp/text_generation",
)
class UnifiedTransformer(nn.Layer):
def __init__(self):
super(UnifiedTransformer, self).__init__()
self.model = UnifiedTransformerLMHeadModel.from_pretrained('plato-mini')
self.tokenizer = UnifiedTransformerTokenizer.from_pretrained('plato-mini')
self._interactive_mode = False
def _convert_text_to_input(self, texts: List[str], max_seq_len: int):
"""
Convert input strings to tokens.
"""
return self.tokenizer.dialogue_encode(texts,
max_seq_len=max_seq_len,
add_start_token_as_response=True,
is_split_into_words=False)
def _batchify(self, data: List[List[str]], max_seq_len: int, batch_size: int):
"""
Generate input batches.
"""
padding = False if batch_size == 1 else True
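        # Pad on the left (pad_right=False) so that all sequences are right-aligned and
        # generation starts immediately after the last real token of each dialogue.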
pad_func = Pad(pad_val=self.tokenizer.pad_token_id, pad_right=False)
def pad_mask(batch_attention_mask):
batch_size = len(batch_attention_mask)
max_len = max(map(len, batch_attention_mask))
attention_mask = np.ones((batch_size, max_len, max_len), dtype='float32') * -1e9
for i, mask_data in enumerate(attention_mask):
seq_len = len(batch_attention_mask[i])
mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i], dtype='float32')
# In order to ensure the correct broadcasting mechanism, expand one
# dimension to the second dimension (n_head of Transformer).
attention_mask = np.expand_dims(attention_mask, axis=1)
return attention_mask
def _parse_batch(batch_examples):
if padding:
input_ids = pad_func([example['input_ids'] for example in batch_examples])
token_type_ids = pad_func([example['token_type_ids'] for example in batch_examples])
position_ids = pad_func([example['position_ids'] for example in batch_examples])
attention_mask = pad_mask([example['attention_mask'] for example in batch_examples])
else:
input_ids = np.asarray([example['input_ids'] for example in batch_examples])
token_type_ids = np.asarray([example['token_type_ids'] for example in batch_examples])
position_ids = np.asarray([example['position_ids'] for example in batch_examples])
attention_mask = np.asarray([example['attention_mask'] for example in batch_examples])
attention_mask = np.expand_dims(attention_mask, 0)
return input_ids, token_type_ids, position_ids, attention_mask
examples = []
for texts in data:
examples.append(self._convert_text_to_input(texts, max_seq_len))
        # Separate the data into batches.
one_batch = []
for example in examples:
one_batch.append(example)
if len(one_batch) == batch_size:
yield _parse_batch(one_batch)
one_batch = []
if one_batch:
yield _parse_batch(one_batch)
@contextlib.contextmanager
def interactive_mode(self, max_turn=3):
"""
Enter the interactive mode.
"""
self._interactive_mode = True
self.max_turn = max_turn
self.context = deque(maxlen=self.max_turn)
yield
self.context.clear()
self._interactive_mode = False
def forward(self,
input_ids,
token_type_ids,
position_ids,
attention_mask,
max_length=64,
min_length=1,
decode_strategy='sampling',
temperature=1.0,
top_k=5,
top_p=1.0,
num_beams=0,
length_penalty=1.0,
early_stopping=False,
num_return_sequences=1):
ids, scores = self.model.generate(input_ids=input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask,
max_length=max_length,
min_length=min_length,
decode_strategy=decode_strategy,
temperature=temperature,
top_k=top_k,
top_p=top_p,
num_beams=num_beams,
length_penalty=length_penalty,
early_stopping=early_stopping,
num_return_sequences=num_return_sequences)
return ids, scores
@serving
def predict(self,
data: Union[List[List[str]], str],
max_seq_len: int = 512,
batch_size: int = 1,
use_gpu: bool = False,
**kwargs):
if self._interactive_mode:
if isinstance(data, str):
self.context.append(data.strip())
data = [list(self.context)]
else:
raise ValueError("In the interactive mode, the input data should be a string.")
elif not isinstance(data, list):
raise ValueError("If not in the interactive mode, the input data should be a list.")
paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu')
batches = self._batchify(data, max_seq_len, batch_size)
results = []
self.eval()
for batch in batches:
input_ids, token_type_ids, position_ids, attention_mask = map(paddle.to_tensor, batch)
ids, scores = self(input_ids, token_type_ids, position_ids, attention_mask, **kwargs)
num_return_sequences = 1 if 'num_return_sequences' not in kwargs\
else kwargs['num_return_sequences']
results.extend(
select_response(ids,
scores,
self.tokenizer,
num_return_sequences=num_return_sequences,
keep_space=False))
if self._interactive_mode:
self.context.append(results[0].strip())
return results
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
def post_process_response(token_ids: List[int], tokenizer):
'''
Post-process the decoded sequence. Truncate from the first <eos>.
'''
eos_pos = len(token_ids)
for i, tok_id in enumerate(token_ids):
if tok_id == tokenizer.sep_token_id:
eos_pos = i
break
token_ids = token_ids[:eos_pos]
tokens = tokenizer.convert_ids_to_tokens(token_ids)
tokens = tokenizer.merge_subword(tokens)
return token_ids, tokens
def get_in_turn_repetition(pred: List[str], is_cn: bool = False):
'''
Get in-turn repetition.
'''
if len(pred) == 0:
return 1.0
if isinstance(pred[0], str):
pred = [tok.lower() for tok in pred]
if is_cn:
pred = "".join(pred)
tri_grams = set()
for i in range(len(pred) - 2):
tri_gram = tuple(pred[i:i + 3])
if tri_gram in tri_grams:
return True
tri_grams.add(tri_gram)
return False
def select_response(ids,
scores: List[float],
tokenizer,
max_dec_len: int = None,
num_return_sequences: int = 1,
keep_space: bool = True):
'''
Select response with the highest score.
'''
ids = ids.numpy().tolist()
scores = scores.numpy()
if len(ids) != len(scores) or (len(ids) % num_return_sequences) != 0:
        raise ValueError("The length of `ids` ({}) must equal the length of `scores` and be a multiple of "
                         "`num_return_sequences` ({}).".format(len(ids), num_return_sequences))
group = []
tmp = []
for pred, score in zip(ids, scores):
pred_token_ids, pred_tokens = post_process_response(pred, tokenizer)
num_token = len(pred_token_ids)
if keep_space:
response = " ".join(pred_tokens)
else:
response = "".join(pred_tokens)
in_turn_repetition = get_in_turn_repetition(pred_tokens, True) or get_in_turn_repetition(pred_token_ids)
        # Penalize responses that hit max_dec_len without ending, and in-turn repetitions.
if max_dec_len is not None and num_token >= max_dec_len:
score -= 1e3
elif in_turn_repetition:
score -= 1e3
tmp.append([response, score])
if len(tmp) == num_return_sequences:
group.append(tmp)
tmp = []
results = []
for preds in group:
preds = sorted(preds, key=lambda x: -x[1])
results.append(preds[0][0])
return results
```shell
$ hub install unified_transformer_12L_cn_luge==1.0.0
```
## Overview
In recent years, human-machine dialogue systems have drawn wide attention from academia and industry and have made solid progress. Open-domain dialogue systems aim to build multi-turn systems that converse with people fluently and naturally, handling both everyday chit-chat and specific functional tasks, which gives them practical application value. More concretely, open-domain dialogue can be broken down into forms that support different skills, such as conversational recommendation and knowledge-grounded dialogue, and combining these skills effectively in one system raises many challenges.
[UnifiedTransformer](https://arxiv.org/abs/2006.16779) uses the [Transformer](https://arxiv.org/abs/1706.03762) encoder as its basic building block and adopts a flexible attention mechanism, which makes it well suited to text generation. Special tokens identifying different dialogue skills are added to the model input, so a single model can handle chit-chat, recommendation dialogue, and knowledge-grounded dialogue at the same time.
unified_transformer_12L_cn_luge is a 12-layer Transformer with 12 attention heads, a hidden size of 768, and 132M parameters. The model was pre-trained on a large Chinese corpus containing 60M text samples and 20M dialogue samples, and then fine-tuned on the luge-dialogue training set; training details are available at [LUGE-Dialogue](https://github.com/PaddlePaddle/Knover/tree/luge-dialogue/luge-dialogue).
## API
```python
def predict(data: Union[List[List[str]], str],
max_seq_len: int = 512,
batch_size: int = 1,
use_gpu: bool = False,
**kwargs):
```
A prediction API: given a dialogue context, it returns the model's reply.
**Parameters**
- `data`(Union[List[List[str]], str]): In non-interactive mode the type is List[List[str]]; each sample is a List[str] holding the dialogue history.
- `max_seq_len`(int): Maximum text length of each sample.
- `batch_size`(int): Batch size used for prediction.
- `use_gpu`(bool): Whether to run prediction on the GPU.
- `kwargs`: Extra arguments passed to the model at prediction time, given as keyword arguments (see the decoding example below). For details of the remaining arguments, see [UnifiedTransformer](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/dialogue/unified_transformer).
**Returns**
* `results`(List[str]): Each element is the model's new reply for the corresponding dialogue.
```python
def interactive_mode(max_turn=3)
```
Enters interactive mode. In interactive mode, the `data` argument of `predict` also accepts a string.
**Parameters**
- `max_turn`(int): Number of dialogue turns the model remembers. When `max_turn` is 1, the model only keeps the current utterance and has no access to earlier turns.
**Code examples**
```python
# Non-interactive mode
import paddlehub as hub
model = hub.Module(name='unified_transformer_12L_cn_luge')
data = [["你是谁?"], ["你好啊。", "吃饭了吗?",]]
result = model.predict(data)
```
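The extra keyword arguments of `predict` are forwarded to the underlying generation call, so decoding can be tuned per call. A minimal sketch (the parameter values below are illustrative, not recommended defaults):
```python
# Tune decoding through the keyword arguments forwarded by predict
import paddlehub as hub
model = hub.Module(name='unified_transformer_12L_cn_luge')
data = [["你好啊。", "吃饭了吗?"]]
# top_k and temperature steer the sampling; with num_return_sequences > 1,
# only the highest-scoring candidate per dialogue is kept in the results.
result = model.predict(data,
                       max_seq_len=512,
                       decode_strategy='sampling',
                       top_k=10,
                       temperature=0.9,
                       num_return_sequences=5)
print(result)
```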
```python
# Interactive mode
import paddlehub as hub
model = hub.Module(name='unified_transformer_12L_cn_luge')
with model.interactive_mode(max_turn=3):
while True:
human_utterance = input("[Human]: ").strip()
robot_utterance = model.predict(human_utterance)[0]
print("[Bot]: %s"%robot_utterance)
```
## Serving deployment
PaddleHub Serving can deploy the model as an online service.
### Step 1: Start PaddleHub Serving
Run the start command:
```shell
$ hub serving start -m unified_transformer_12L_cn_luge
```
This deploys a dialogue-bot API as a service; the default port is 8866.
**NOTE:** To run prediction on a GPU, set the CUDA_VISIBLE_DEVICES environment variable before starting the service; otherwise no extra setting is needed.
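For example, to serve on the first GPU (adjust the device index to your machine):
```shell
$ export CUDA_VISIBLE_DEVICES=0
$ hub serving start -m unified_transformer_12L_cn_luge
```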
### Step 2: Send a prediction request
With the server up, the following few lines of code send a prediction request and retrieve the result:
```python
import requests
import json
texts = [["今天是个好日子"], ["天气预报说今天要下雨"]]
data = {"data": texts}
# Send a POST request. The content type must be JSON, and the IP address in the URL should be changed to that of the serving machine.
url = "http://127.0.0.1:8866/predict/unified_transformer_12L_cn_luge"
# Set the POST request headers to application/json
headers = {"Content-Type": "application/json"}
r = requests.post(url=url, headers=headers, data=json.dumps(data))
print(r.json())
```
## Source code
https://github.com/PaddlePaddle/Knover
## Dependencies
paddlepaddle >= 2.0.0
paddlehub >= 2.1.0
## Release history
* 1.0.0
  First release
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from collections import deque
from typing import List, Union
import numpy as np
import paddle
import paddle.nn as nn
from paddlehub.module.module import moduleinfo, serving
from paddlenlp.data import Pad
from paddlenlp.transformers import UnifiedTransformerLMHeadModel, UnifiedTransformerTokenizer
from unified_transformer_12L_cn_luge.utils import select_response
@moduleinfo(
name="unified_transformer_12L_cn_luge",
version="1.0.0",
summary="",
author="PaddlePaddle",
author_email="",
type="nlp/text_generation",
)
class UnifiedTransformer(nn.Layer):
def __init__(self):
super(UnifiedTransformer, self).__init__()
self.model = UnifiedTransformerLMHeadModel.from_pretrained('unified_transformer-12L-cn-luge')
self.tokenizer = UnifiedTransformerTokenizer.from_pretrained('unified_transformer-12L-cn-luge')
self._interactive_mode = False
def _convert_text_to_input(self, texts: List[str], max_seq_len: int):
"""
Convert input strings to tokens.
"""
return self.tokenizer.dialogue_encode(texts,
max_seq_len=max_seq_len,
add_start_token_as_response=True,
is_split_into_words=False)
def _batchify(self, data: List[List[str]], max_seq_len: int, batch_size: int):
"""
Generate input batches.
"""
padding = False if batch_size == 1 else True
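        # Pad on the left (pad_right=False) so that all sequences are right-aligned and
        # generation starts immediately after the last real token of each dialogue.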
pad_func = Pad(pad_val=self.tokenizer.pad_token_id, pad_right=False)
def pad_mask(batch_attention_mask):
batch_size = len(batch_attention_mask)
max_len = max(map(len, batch_attention_mask))
attention_mask = np.ones((batch_size, max_len, max_len), dtype='float32') * -1e9
for i, mask_data in enumerate(attention_mask):
seq_len = len(batch_attention_mask[i])
mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i], dtype='float32')
# In order to ensure the correct broadcasting mechanism, expand one
# dimension to the second dimension (n_head of Transformer).
attention_mask = np.expand_dims(attention_mask, axis=1)
return attention_mask
def _parse_batch(batch_examples):
if padding:
input_ids = pad_func([example['input_ids'] for example in batch_examples])
token_type_ids = pad_func([example['token_type_ids'] for example in batch_examples])
position_ids = pad_func([example['position_ids'] for example in batch_examples])
attention_mask = pad_mask([example['attention_mask'] for example in batch_examples])
else:
input_ids = np.asarray([example['input_ids'] for example in batch_examples])
token_type_ids = np.asarray([example['token_type_ids'] for example in batch_examples])
position_ids = np.asarray([example['position_ids'] for example in batch_examples])
attention_mask = np.asarray([example['attention_mask'] for example in batch_examples])
attention_mask = np.expand_dims(attention_mask, 0)
return input_ids, token_type_ids, position_ids, attention_mask
examples = []
for texts in data:
examples.append(self._convert_text_to_input(texts, max_seq_len))
        # Separate the data into batches.
one_batch = []
for example in examples:
one_batch.append(example)
if len(one_batch) == batch_size:
yield _parse_batch(one_batch)
one_batch = []
if one_batch:
yield _parse_batch(one_batch)
@contextlib.contextmanager
def interactive_mode(self, max_turn=3):
"""
Enter the interactive mode.
"""
self._interactive_mode = True
self.max_turn = max_turn
self.context = deque(maxlen=self.max_turn)
yield
self.context.clear()
self._interactive_mode = False
def forward(self,
input_ids,
token_type_ids,
position_ids,
attention_mask,
max_length=64,
min_length=1,
decode_strategy='sampling',
temperature=1.0,
top_k=5,
top_p=1.0,
num_beams=0,
length_penalty=1.0,
early_stopping=False,
num_return_sequences=1):
ids, scores = self.model.generate(input_ids=input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask,
max_length=max_length,
min_length=min_length,
decode_strategy=decode_strategy,
temperature=temperature,
top_k=top_k,
top_p=top_p,
num_beams=num_beams,
length_penalty=length_penalty,
early_stopping=early_stopping,
num_return_sequences=num_return_sequences)
return ids, scores
@serving
def predict(self,
data: Union[List[List[str]], str],
max_seq_len: int = 512,
batch_size: int = 1,
use_gpu: bool = False,
**kwargs):
if self._interactive_mode:
if isinstance(data, str):
self.context.append(data.strip())
data = [list(self.context)]
else:
raise ValueError("In the interactive mode, the input data should be a string.")
elif not isinstance(data, list):
raise ValueError("If not in the interactive mode, the input data should be a list.")
paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu')
batches = self._batchify(data, max_seq_len, batch_size)
results = []
self.eval()
for batch in batches:
input_ids, token_type_ids, position_ids, attention_mask = map(paddle.to_tensor, batch)
ids, scores = self(input_ids, token_type_ids, position_ids, attention_mask, **kwargs)
num_return_sequences = 1 if 'num_return_sequences' not in kwargs\
else kwargs['num_return_sequences']
results.extend(
select_response(ids,
scores,
self.tokenizer,
num_return_sequences=num_return_sequences,
keep_space=False))
if self._interactive_mode:
self.context.append(results[0].strip())
return results
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
def post_process_response(token_ids: List[int], tokenizer):
'''
Post-process the decoded sequence. Truncate from the first <eos>.
'''
eos_pos = len(token_ids)
for i, tok_id in enumerate(token_ids):
if tok_id == tokenizer.sep_token_id:
eos_pos = i
break
token_ids = token_ids[:eos_pos]
tokens = tokenizer.convert_ids_to_tokens(token_ids)
tokens = tokenizer.merge_subword(tokens)
return token_ids, tokens
def get_in_turn_repetition(pred: List[str], is_cn: bool = False):
'''
Get in-turn repetition.
'''
if len(pred) == 0:
return 1.0
if isinstance(pred[0], str):
pred = [tok.lower() for tok in pred]
if is_cn:
pred = "".join(pred)
tri_grams = set()
for i in range(len(pred) - 2):
tri_gram = tuple(pred[i:i + 3])
if tri_gram in tri_grams:
return True
tri_grams.add(tri_gram)
return False
def select_response(ids,
scores: List[float],
tokenizer,
max_dec_len: int = None,
num_return_sequences: int = 1,
keep_space: bool = True):
'''
Select response with the highest score.
'''
ids = ids.numpy().tolist()
scores = scores.numpy()
if len(ids) != len(scores) or (len(ids) % num_return_sequences) != 0:
        raise ValueError("The length of `ids` ({}) must equal the length of `scores` and be a multiple of "
                         "`num_return_sequences` ({}).".format(len(ids), num_return_sequences))
group = []
tmp = []
for pred, score in zip(ids, scores):
pred_token_ids, pred_tokens = post_process_response(pred, tokenizer)
num_token = len(pred_token_ids)
if keep_space:
response = " ".join(pred_tokens)
else:
response = "".join(pred_tokens)
in_turn_repetition = get_in_turn_repetition(pred_tokens, True) or get_in_turn_repetition(pred_token_ids)
        # Penalize responses that hit max_dec_len without ending, and in-turn repetitions.
if max_dec_len is not None and num_token >= max_dec_len:
score -= 1e3
elif in_turn_repetition:
score -= 1e3
tmp.append([response, score])
if len(tmp) == num_return_sequences:
group.append(tmp)
tmp = []
results = []
for preds in group:
preds = sorted(preds, key=lambda x: -x[1])
results.append(preds[0][0])
return results
```shell
$ hub install unified_transformer_12L_cn==1.0.0
```
## Overview
In recent years, human-machine dialogue systems have drawn wide attention from academia and industry and have made solid progress. Open-domain dialogue systems aim to build multi-turn systems that converse with people fluently and naturally, handling both everyday chit-chat and specific functional tasks, which gives them practical application value. More concretely, open-domain dialogue can be broken down into forms that support different skills, such as conversational recommendation and knowledge-grounded dialogue, and combining these skills effectively in one system raises many challenges.
[UnifiedTransformer](https://arxiv.org/abs/2006.16779) uses the [Transformer](https://arxiv.org/abs/1706.03762) encoder as its basic building block and adopts a flexible attention mechanism, which makes it well suited to text generation. Special tokens identifying different dialogue skills are added to the model input, so a single model can handle chit-chat, recommendation dialogue, and knowledge-grounded dialogue at the same time.
unified_transformer_12L_cn is a 12-layer Transformer with 12 attention heads, a hidden size of 768, and 132M parameters. The model was pre-trained on a large Chinese corpus containing 60M text samples and 20M dialogue samples; training details are available at [LUGE-Dialogue](https://github.com/PaddlePaddle/Knover/tree/luge-dialogue/luge-dialogue).
## API
```python
def predict(data: Union[List[List[str]], str],
max_seq_len: int = 512,
batch_size: int = 1,
use_gpu: bool = False,
**kwargs):
```
A prediction API: given a dialogue context, it returns the model's reply.
**Parameters**
- `data`(Union[List[List[str]], str]): In non-interactive mode the type is List[List[str]]; each sample is a List[str] holding the dialogue history.
- `max_seq_len`(int): Maximum text length of each sample.
- `batch_size`(int): Batch size used for prediction.
- `use_gpu`(bool): Whether to run prediction on the GPU.
- `kwargs`: Extra arguments passed to the model at prediction time, given as keyword arguments (see the decoding example below). For details of the remaining arguments, see [UnifiedTransformer](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/dialogue/unified_transformer).
**Returns**
* `results`(List[str]): Each element is the model's new reply for the corresponding dialogue.
```python
def interactive_mode(max_turn=3)
```
Enters interactive mode. In interactive mode, the `data` argument of `predict` also accepts a string.
**Parameters**
- `max_turn`(int): Number of dialogue turns the model remembers. When `max_turn` is 1, the model only keeps the current utterance and has no access to earlier turns.
**Code examples**
```python
# Non-interactive mode
import paddlehub as hub
model = hub.Module(name='unified_transformer_12L_cn')
data = [["你是谁?"], ["你好啊。", "吃饭了吗?",]]
result = model.predict(data)
```
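The extra keyword arguments of `predict` are forwarded to the underlying generation call, so decoding can be tuned per call. A minimal sketch (the parameter values below are illustrative, not recommended defaults):
```python
# Tune decoding through the keyword arguments forwarded by predict
import paddlehub as hub
model = hub.Module(name='unified_transformer_12L_cn')
data = [["你好啊。", "吃饭了吗?"]]
# top_k and temperature steer the sampling; with num_return_sequences > 1,
# only the highest-scoring candidate per dialogue is kept in the results.
result = model.predict(data,
                       max_seq_len=512,
                       decode_strategy='sampling',
                       top_k=10,
                       temperature=0.9,
                       num_return_sequences=5)
print(result)
```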
```python
# Interactive mode
import paddlehub as hub
model = hub.Module(name='unified_transformer_12L_cn')
with model.interactive_mode(max_turn=3):
while True:
human_utterance = input("[Human]: ").strip()
robot_utterance = model.predict(human_utterance)[0]
print("[Bot]: %s"%robot_utterance)
```
## Serving deployment
PaddleHub Serving can deploy the model as an online service.
### Step 1: Start PaddleHub Serving
Run the start command:
```shell
$ hub serving start -m unified_transformer_12L_cn
```
This deploys a dialogue-bot API as a service; the default port is 8866.
**NOTE:** To run prediction on a GPU, set the CUDA_VISIBLE_DEVICES environment variable before starting the service; otherwise no extra setting is needed.
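For example, to serve on the first GPU (adjust the device index to your machine):
```shell
$ export CUDA_VISIBLE_DEVICES=0
$ hub serving start -m unified_transformer_12L_cn
```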
### Step 2: Send a prediction request
With the server up, the following few lines of code send a prediction request and retrieve the result:
```python
import requests
import json
texts = [["今天是个好日子"], ["天气预报说今天要下雨"]]
data = {"data": texts}
# Send a POST request. The content type must be JSON, and the IP address in the URL should be changed to that of the serving machine.
url = "http://127.0.0.1:8866/predict/unified_transformer_12L_cn"
# Set the POST request headers to application/json
headers = {"Content-Type": "application/json"}
r = requests.post(url=url, headers=headers, data=json.dumps(data))
print(r.json())
```
## Source code
https://github.com/PaddlePaddle/Knover
## Dependencies
paddlepaddle >= 2.0.0
paddlehub >= 2.1.0
## Release history
* 1.0.0
  First release
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from collections import deque
from typing import List, Union
import numpy as np
import paddle
import paddle.nn as nn
from paddlehub.module.module import moduleinfo, serving
from paddlenlp.data import Pad
from paddlenlp.transformers import UnifiedTransformerLMHeadModel, UnifiedTransformerTokenizer
from unified_transformer_12L_cn.utils import select_response
@moduleinfo(
name="unified_transformer_12L_cn",
version="1.0.0",
summary="",
author="PaddlePaddle",
author_email="",
type="nlp/text_generation",
)
class UnifiedTransformer(nn.Layer):
def __init__(self):
super(UnifiedTransformer, self).__init__()
self.model = UnifiedTransformerLMHeadModel.from_pretrained('unified_transformer-12L-cn')
self.tokenizer = UnifiedTransformerTokenizer.from_pretrained('unified_transformer-12L-cn')
self._interactive_mode = False
def _convert_text_to_input(self, texts: List[str], max_seq_len: int):
"""
Convert input strings to tokens.
"""
return self.tokenizer.dialogue_encode(texts,
max_seq_len=max_seq_len,
add_start_token_as_response=True,
is_split_into_words=False)
def _batchify(self, data: List[List[str]], max_seq_len: int, batch_size: int):
"""
Generate input batches.
"""
padding = False if batch_size == 1 else True
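        # Pad on the left (pad_right=False) so that all sequences are right-aligned and
        # generation starts immediately after the last real token of each dialogue.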
pad_func = Pad(pad_val=self.tokenizer.pad_token_id, pad_right=False)
def pad_mask(batch_attention_mask):
batch_size = len(batch_attention_mask)
max_len = max(map(len, batch_attention_mask))
attention_mask = np.ones((batch_size, max_len, max_len), dtype='float32') * -1e9
for i, mask_data in enumerate(attention_mask):
seq_len = len(batch_attention_mask[i])
mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i], dtype='float32')
# In order to ensure the correct broadcasting mechanism, expand one
# dimension to the second dimension (n_head of Transformer).
attention_mask = np.expand_dims(attention_mask, axis=1)
return attention_mask
def _parse_batch(batch_examples):
if padding:
input_ids = pad_func([example['input_ids'] for example in batch_examples])
token_type_ids = pad_func([example['token_type_ids'] for example in batch_examples])
position_ids = pad_func([example['position_ids'] for example in batch_examples])
attention_mask = pad_mask([example['attention_mask'] for example in batch_examples])
else:
input_ids = np.asarray([example['input_ids'] for example in batch_examples])
token_type_ids = np.asarray([example['token_type_ids'] for example in batch_examples])
position_ids = np.asarray([example['position_ids'] for example in batch_examples])
attention_mask = np.asarray([example['attention_mask'] for example in batch_examples])
attention_mask = np.expand_dims(attention_mask, 0)
return input_ids, token_type_ids, position_ids, attention_mask
examples = []
for texts in data:
examples.append(self._convert_text_to_input(texts, max_seq_len))
        # Separate the data into batches.
one_batch = []
for example in examples:
one_batch.append(example)
if len(one_batch) == batch_size:
yield _parse_batch(one_batch)
one_batch = []
if one_batch:
yield _parse_batch(one_batch)
@contextlib.contextmanager
def interactive_mode(self, max_turn=3):
"""
Enter the interactive mode.
"""
self._interactive_mode = True
self.max_turn = max_turn
self.context = deque(maxlen=self.max_turn)
yield
self.context.clear()
self._interactive_mode = False
def forward(self,
input_ids,
token_type_ids,
position_ids,
attention_mask,
max_length=64,
min_length=1,
decode_strategy='sampling',
temperature=1.0,
top_k=5,
top_p=1.0,
num_beams=0,
length_penalty=1.0,
early_stopping=False,
num_return_sequences=1):
ids, scores = self.model.generate(input_ids=input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask,
max_length=max_length,
min_length=min_length,
decode_strategy=decode_strategy,
temperature=temperature,
top_k=top_k,
top_p=top_p,
num_beams=num_beams,
length_penalty=length_penalty,
early_stopping=early_stopping,
num_return_sequences=num_return_sequences)
return ids, scores
@serving
def predict(self,
data: Union[List[List[str]], str],
max_seq_len: int = 512,
batch_size: int = 1,
use_gpu: bool = False,
**kwargs):
if self._interactive_mode:
if isinstance(data, str):
self.context.append(data.strip())
data = [list(self.context)]
else:
raise ValueError("In the interactive mode, the input data should be a string.")
elif not isinstance(data, list):
raise ValueError("If not in the interactive mode, the input data should be a list.")
paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu')
batches = self._batchify(data, max_seq_len, batch_size)
results = []
self.eval()
for batch in batches:
input_ids, token_type_ids, position_ids, attention_mask = map(paddle.to_tensor, batch)
ids, scores = self(input_ids, token_type_ids, position_ids, attention_mask, **kwargs)
num_return_sequences = 1 if 'num_return_sequences' not in kwargs\
else kwargs['num_return_sequences']
results.extend(
select_response(ids,
scores,
self.tokenizer,
num_return_sequences=num_return_sequences,
keep_space=False))
if self._interactive_mode:
self.context.append(results[0].strip())
return results
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
def post_process_response(token_ids: List[int], tokenizer):
'''
Post-process the decoded sequence. Truncate from the first <eos>.
'''
eos_pos = len(token_ids)
for i, tok_id in enumerate(token_ids):
if tok_id == tokenizer.sep_token_id:
eos_pos = i
break
token_ids = token_ids[:eos_pos]
tokens = tokenizer.convert_ids_to_tokens(token_ids)
tokens = tokenizer.merge_subword(tokens)
return token_ids, tokens
def get_in_turn_repetition(pred: List[str], is_cn: bool = False):
'''
Get in-turn repetition.
'''
if len(pred) == 0:
return 1.0
if isinstance(pred[0], str):
pred = [tok.lower() for tok in pred]
if is_cn:
pred = "".join(pred)
tri_grams = set()
for i in range(len(pred) - 2):
tri_gram = tuple(pred[i:i + 3])
if tri_gram in tri_grams:
return True
tri_grams.add(tri_gram)
return False
def select_response(ids,
scores: List[float],
tokenizer,
max_dec_len: int = None,
num_return_sequences: int = 1,
keep_space: bool = True):
'''
Select response with the highest score.
'''
ids = ids.numpy().tolist()
scores = scores.numpy()
if len(ids) != len(scores) or (len(ids) % num_return_sequences) != 0:
        raise ValueError("The length of `ids` ({}) must equal the length of `scores` and be a multiple of "
                         "`num_return_sequences` ({}).".format(len(ids), num_return_sequences))
group = []
tmp = []
for pred, score in zip(ids, scores):
pred_token_ids, pred_tokens = post_process_response(pred, tokenizer)
num_token = len(pred_token_ids)
if keep_space:
response = " ".join(pred_tokens)
else:
response = "".join(pred_tokens)
in_turn_repetition = get_in_turn_repetition(pred_tokens, True) or get_in_turn_repetition(pred_token_ids)
        # Penalize responses that hit max_dec_len without ending, and in-turn repetitions.
if max_dec_len is not None and num_token >= max_dec_len:
score -= 1e3
elif in_turn_repetition:
score -= 1e3
tmp.append([response, score])
if len(tmp) == num_return_sequences:
group.append(tmp)
tmp = []
results = []
for preds in group:
preds = sorted(preds, key=lambda x: -x[1])
results.append(preds[0][0])
return results