From 524a75122f9d2585e1a74d039cd9cc1ecb70a3e9 Mon Sep 17 00:00:00 2001
From: jm12138 <2286040843@qq.com>
Date: Thu, 29 Dec 2022 10:08:23 +0800
Subject: [PATCH] update plato2_en_large (#2114)
* update plato2_en_large
* update README
---
.../text_generation/plato2_en_large/README.md | 196 ++++----
.../plato2_en_large/__init__.py | 0
.../text_generation/plato2_en_large/model.py | 451 ++++++++++++++++++
.../plato2_en_large/models/__init__.py | 65 ---
.../plato2_en_large/models/generator.py | 268 -----------
.../plato2_en_large/models/model_base.py | 288 -----------
.../plato2_en_large/models/nsp_model.py | 107 -----
.../plato2_en_large/models/optimizer.py | 37 --
.../plato2_en_large/models/plato.py | 241 ----------
.../models/transformer_block.py | 332 -------------
.../models/unified_transformer.py | 378 ---------------
.../text_generation/plato2_en_large/module.py | 125 ++---
.../plato2_en_large/readers/dialog_reader.py | 130 +++--
.../plato2_en_large/readers/nsp_reader.py | 74 ++-
.../plato2_en_large/readers/plato_reader.py | 36 +-
.../plato2_en_large/tasks/__init__.py | 61 ---
.../tasks/dialog_generation.py | 292 ------------
.../tasks/next_sentence_prediction.py | 44 --
.../plato2_en_large/tasks/task_base.py | 86 ----
.../plato2_en_large/utils/__init__.py | 179 ++-----
.../plato2_en_large/utils/args.py | 3 -
.../plato2_en_large/utils/inference.py | 42 --
.../plato2_en_large/utils/masking.py | 31 +-
.../plato2_en_large/utils/tokenization.py | 15 +-
24 files changed, 826 insertions(+), 2655 deletions(-)
delete mode 100644 modules/text/text_generation/plato2_en_large/__init__.py
create mode 100644 modules/text/text_generation/plato2_en_large/model.py
delete mode 100644 modules/text/text_generation/plato2_en_large/models/__init__.py
delete mode 100644 modules/text/text_generation/plato2_en_large/models/generator.py
delete mode 100644 modules/text/text_generation/plato2_en_large/models/model_base.py
delete mode 100644 modules/text/text_generation/plato2_en_large/models/nsp_model.py
delete mode 100644 modules/text/text_generation/plato2_en_large/models/optimizer.py
delete mode 100644 modules/text/text_generation/plato2_en_large/models/plato.py
delete mode 100644 modules/text/text_generation/plato2_en_large/models/transformer_block.py
delete mode 100644 modules/text/text_generation/plato2_en_large/models/unified_transformer.py
delete mode 100644 modules/text/text_generation/plato2_en_large/tasks/__init__.py
delete mode 100644 modules/text/text_generation/plato2_en_large/tasks/dialog_generation.py
delete mode 100644 modules/text/text_generation/plato2_en_large/tasks/next_sentence_prediction.py
delete mode 100644 modules/text/text_generation/plato2_en_large/tasks/task_base.py
delete mode 100644 modules/text/text_generation/plato2_en_large/utils/inference.py
diff --git a/modules/text/text_generation/plato2_en_large/README.md b/modules/text/text_generation/plato2_en_large/README.md
index 892212ba..529024cd 100644
--- a/modules/text/text_generation/plato2_en_large/README.md
+++ b/modules/text/text_generation/plato2_en_large/README.md
@@ -1,154 +1,122 @@
-## Overview
+# plato2_en_large
-PLATO2 is a very large-scale generative dialogue model. It inherits PLATO's latent-variable mechanism for generating diverse responses and can hold fluent, in-depth conversations on open-domain topics. According to published results, it outperforms Meena (released by Google in February 2020) and Blender (released by Facebook AI Research in April 2020). plato2_en_large contains 1.6B parameters and can be used for one-line dialogue response prediction. This Module only supports GPU prediction; CPU is not supported.
-
-
-
+| Model Name | plato2_en_large |
+| :------------------ | :--------------------: |
+| Category | Text-Text Generation |
+| Network | PLATO2 |
+| Dataset | Large-scale open-domain English dataset |
+| Fine-tuning supported | No |
+| Model Size | 19.3 GB |
+| Latest update date | 2022-11-05 |
+| Data indicators | - |
-For more details, please refer to the paper [PLATO-2: Towards Building an Open-Domain Chatbot via Curriculum Learning](https://arxiv.org/abs/2006.16779)
+## I. Basic Information
-**Note: the plato2\_en\_large model is 12 GB, so downloading may take a while; please be patient. Running this model requires at least 16 GB of GPU memory.**
+- ### Model Introduction
+  - PLATO2 is a very large-scale generative dialogue model. It inherits PLATO's latent-variable mechanism for generating diverse responses and can hold fluent, in-depth conversations on open-domain topics. According to published results, it outperforms Meena (released by Google in February 2020) and Blender (released by Facebook AI Research in April 2020). plato2_en_large contains 1.6B parameters and can be used for one-line dialogue response prediction. Since this Module has a large number of parameters, GPU prediction is recommended.
-## Command-line prediction
-```shell
-$ hub run plato2_en_large --input_text="Hello, how are you"
-```
+## II. Installation
-## API
+- ### 1. Environment dependencies
-```python
-def generate(texts):
-```
+ - paddlepaddle >= 2.0.0
+  - paddlehub >= 2.1.0 | [How to install PaddleHub](../../../../docs/docs_ch/get_start/installation.rst)
-Prediction API. It takes a dialogue context as input and returns the bot's reply.
+- ### 2. Installation
-**Parameters**
+ - ```shell
+ $ hub install plato2_en_large
+ ```
+  - If you encounter problems during installation, please refer to: [Windows installation guide](../../../../docs/docs_ch/get_start/windows_quickstart.md)
+    | [Linux installation guide](../../../../docs/docs_ch/get_start/linux_quickstart.md) | [MacOS installation guide](../../../../docs/docs_ch/get_start/mac_quickstart.md)
-* texts (list\[str\] or str): If not in interactive mode, texts should be a list in which each element is the context of one dialogue; the context should contain the utterances of both the human and the bot, with different speakers separated by the delimiter "\t", e.g. [["Hello\thi, nice to meet you\tnice to meet you"]]. This input contains one dialogue: the bot replied "hi, nice to meet you", the human then replied "nice to meet you", and now it is the bot's turn to respond. In interactive mode, texts should be a str and the model will build its context automatically.
+## III. Module API Prediction
-**Returns**
+- ### 1. Command-line prediction
-* results (list\[str\]): Each element is the bot's new reply in the corresponding dialogue.
+ - ```bash
+ $ hub run plato2_en_large --input_text="Hello, how are you"
+ ```
-**Code example**
+- ### 2. Prediction code example
-```python
-import paddlehub as hub
+ - ```python
+ import paddlehub as hub
-module = hub.Module(name="plato2_en_large")
+ module = hub.Module(name="plato2_en_large")
-test_texts = ["Hello","Hello\thi, nice to meet you\tnice to meet you"]
-results = module.generate(texts=test_texts)
-for result in results:
- print(result)
-```
+ test_texts = ["Hello","Hello\thi, nice to meet you\tnice to meet you"]
+ results = module.generate(texts=test_texts)
+ for result in results:
+ print(result)
+ ```
-```python
-def interactive_mode(max_turn =6):
-```
+- ### 3. API
-Enter interactive mode. In interactive mode, the texts argument of the generate API accepts a string.
+ - ```python
+ def generate(texts):
+ ```
-**Parameters**
+  - Prediction API. It takes a dialogue context as input and returns the bot's reply.
+  - **Parameters**
+    - texts (list\[str\] or str): If not in interactive mode, texts should be a list in which each element is the context of one dialogue; the context should contain the utterances of both the human and the bot, with different speakers separated by the delimiter "\t", e.g. [["Hello\thi, nice to meet you\tnice to meet you"]]. This input contains one dialogue: the bot replied "hi, nice to meet you", the human then replied "nice to meet you", and now it is the bot's turn to respond. In interactive mode, texts should be a str and the model will build its context automatically.
-* max_turn (int): The number of dialogue turns the model can remember. When max_turn = 1, the model only remembers the current utterance and cannot access earlier turns.
+ - ```python
+ def interactive_mode(max_turn=6):
+ ```
-**Code example**
+  - Enter interactive mode. In interactive mode, the texts argument of the generate API accepts a string.
+  - **Parameters**
+    - max_turn (int): The number of dialogue turns the model can remember. When max_turn = 1, the model only remembers the current utterance and cannot access earlier turns.
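+  - **Code example**
+  - A minimal sketch of interactive-mode usage, adapted from the dialogue loop shown in the previous version of this README:
+  - ```python
+    import paddlehub as hub
+
+    module = hub.Module(name="plato2_en_large")
+
+    with module.interactive_mode(max_turn=6):
+        while True:
+            human_utterance = input("[Human]: ").strip()
+            robot_utterance = module.generate(human_utterance)
+            print("[Bot]: %s" % robot_utterance[0])
+    ```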
-```python
-import paddlehub as hub
-module = hub.Module(name="plato2_en_large")
+## IV. Server Deployment
-with module.interactive_mode(max_turn=6):
- while True:
- human_utterance = input("[Human]: ").strip()
- robot_utterance = module.generate(human_utterance)
- print("[Bot]: %s"%robot_utterance[0])
-```
+- PaddleHub Serving can deploy an online chatbot service.
-## Server Deployment
+- ### Step 1: Start PaddleHub Serving
-PaddleHub Serving can deploy an online service.
+  - Run the start command:
+ - ```shell
+ $ hub serving start -m plato2_en_large -p 8866
+ ```
-### Step 1: Start PaddleHub Serving
+  - This completes the deployment of a chatbot service API, listening on port 8866 by default.
+  - **NOTE:** If you use GPU for prediction, set the CUDA_VISIBLE_DEVICES environment variable before starting the service; otherwise, it does not need to be set.
-Run the start command:
-```shell
-$ hub serving start -m plato2_en_large -p 8866
-```
-This completes the deployment of a service API, listening on port 8866 by default.
+- ### Step 2: Send prediction requests
-**NOTE:** Before starting the service, please set the CUDA\_VISIBLE\_DEVICES environment variable.
+  - With the server configured, the following few lines of code send a prediction request and retrieve the result
-### Step 2: Send prediction requests
+ - ```python
+ import requests
+ import json
-Option 1: Send dialogue messages with a custom script
+ data = {'texts':["Hello","Hello\thi, nice to meet you\tnice to meet you"]}
+ headers = {"Content-type": "application/json"}
+ url = "http://127.0.0.1:8866/predict/plato2_en_large"
+ r = requests.post(url=url, headers=headers, data=json.dumps(data))
-With the server configured, the following few lines of code send a prediction request and retrieve the result
+    # Save the results
+ results = r.json()["results"]
+ for result in results:
+ print(result)
+ ```
-```python
-import requests
-import json
+  - For more information about PaddleHub Serving, please refer to [Serving Deployment](../../../../docs/docs_ch/tutorial/serving.md)
-# Send an HTTP request
+## V. Release Note
-data = {'texts':["Hello","Hello\thi, nice to meet you\tnice to meet you"]}
-headers = {"Content-type": "application/json"}
-url = "http://127.0.0.1:8866/predict/plato2_en_large"
-r = requests.post(url=url, headers=headers, data=json.dumps(data))
-
-# 保存结果
-results = r.json()["results"]
-for result in results:
- print(result)
-```
-
-Option 2: Enter interactive mode through an interactive client
-
-You can run the following client script to enter interactive mode:
-
-```python
-import requests
-import json
-
-ADDR = "127.0.0.1" # Your serving address
-PORT = 8866 # Your serving port
-MAX_TURN = 6
-
-headers = {"Content-type": "application/json"}
-url = "http://%s:%s/predict/plato2_en_large" % (ADDR, PORT)
-
-context = []
-while True:
- user_utt = input("[Human]: ").strip()
- if user_utt == "[NEXT]":
- context = ""
- print("Restart")
- else:
- context.append(user_utt)
- data = {'texts': ["\t".join(context[-MAX_TURN:])]}
- r = requests.post(url=url, headers=headers, data=json.dumps(data))
- bot_response = r.json()["results"][0]
- print("[Bot]: %s"%bot_response)
- context.append(bot_response)
-```
-
-## View the Code
-
-https://github.com/PaddlePaddle/Knover
-
-### Dependencies
-
-1.8.2 <= paddlepaddle < 2.0.0
-
-1.7.0 <= paddlehub < 2.0.0
+* 1.0.0
+  First release
-## Release Note
+* 1.1.0
-* 1.0.0
+  Remove the Fluid APIs
-  First release
+  - ```shell
+    $ hub install plato2_en_large==1.1.0
+    ```
diff --git a/modules/text/text_generation/plato2_en_large/__init__.py b/modules/text/text_generation/plato2_en_large/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/modules/text/text_generation/plato2_en_large/model.py b/modules/text/text_generation/plato2_en_large/model.py
new file mode 100644
index 00000000..0b322a7a
--- /dev/null
+++ b/modules/text/text_generation/plato2_en_large/model.py
@@ -0,0 +1,451 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections import namedtuple
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+def post_process_context(token_ids, reader, merge=True):
+ """Post-process the context sequence."""
+ context = []
+ utt = []
+ for tok_id in token_ids[1:]:
+ if tok_id == reader.eos_id:
+ utt = reader.tokenizer.convert_ids_to_tokens(utt)
+ if merge:
+ utt = reader.tokenizer.merge_subword(utt)
+ context.append(utt)
+ utt = []
+ else:
+ utt.append(tok_id)
+ return context
+
+
+def post_process_response(token_ids, reader, merge=True):
+    """
+    Post-process the decoded sequence: truncate at the first
+    <eos> token and remove the <bos> and <eos> tokens.
+ """
+ eos_pos = len(token_ids)
+ for i, tok_id in enumerate(token_ids):
+ if tok_id == reader.eos_id:
+ eos_pos = i
+ break
+ token_ids = token_ids[1:eos_pos]
+ response = reader.tokenizer.convert_ids_to_tokens(token_ids)
+ if merge:
+ response = reader.tokenizer.merge_subword(response)
+ return token_ids, response
+
+
+def get_cross_turn_repetition(context, pred_tokens, eos_idx, is_cn=False):
+ """Get cross-turn repetition."""
+ if len(pred_tokens) == 0:
+ return 1.0
+ if is_cn:
+ context = ["".join(utt) for utt in context]
+ pred_tokens = "".join(pred_tokens)
+
+ pred_tri_grams = set()
+ for i in range(len(pred_tokens) - 2):
+ tri_gram = tuple(pred_tokens[i:i + 3])
+ pred_tri_grams.add(tri_gram)
+ for utt in context:
+ for i in range(len(utt) - 2):
+ tri_gram = tuple(utt[i:i + 3])
+ if tri_gram in pred_tri_grams:
+ return 1.0
+ return 0.0
+
+
+def get_in_turn_repetition(pred, is_cn=False):
+ """Get in-turn repetition."""
+ if len(pred) == 0:
+ return 1.0
+ if isinstance(pred[0], str):
+ pred = [tok.lower() for tok in pred]
+ if is_cn:
+ pred = "".join(pred)
+ tri_grams = set()
+ for i in range(len(pred) - 2):
+ tri_gram = tuple(pred[i:i + 3])
+ if tri_gram in tri_grams:
+ return 1.0
+ tri_grams.add(tri_gram)
+ return 0.0
+
+
+class Plato2EncoderLayer(nn.Layer):
+
+ def __init__(self, n_head, hidden_size, attn_dropout, act_dropout):
+ super(Plato2EncoderLayer, self).__init__()
+
+ self.self_attn = nn.MultiHeadAttention(hidden_size, n_head, attn_dropout)
+ self.pre_norm_layer = nn.LayerNorm(hidden_size)
+ self.post_norm_layer = nn.LayerNorm(hidden_size)
+ self.fc1 = nn.Linear(hidden_size, hidden_size * 4)
+ self.fc2 = nn.Linear(hidden_size * 4, hidden_size)
+
+ self.dropout_layer = nn.Dropout(act_dropout)
+ self.gelu_layer = nn.GELU()
+
+ def forward(self, x, attn_mask, cache):
+ query = self.pre_norm_layer(x)
+ attn_output, new_cache = self.self_attn(query, None, None, attn_mask, cache)
+ attn_output = self.dropout_layer(attn_output)
+ attn_output = attn_output + x
+ ffd_input = self.post_norm_layer(attn_output)
+
+ ffd_output = self.fc1(ffd_input)
+ ffd_output = self.gelu_layer(ffd_output)
+ ffd_output = self.dropout_layer(ffd_output)
+
+ ffd_output = self.fc2(ffd_output)
+ ffd_output = self.dropout_layer(ffd_output)
+ out = ffd_output + attn_output
+
+ return out, new_cache
+
+ def gen_cache(self, key):
+ return self.self_attn.gen_cache(key)
+
+
+class Plato2Encoder(nn.Layer):
+
+ def __init__(self, vocab_size, type_size, max_position_seq_len, num_layers, n_head, hidden_size, attn_dropout,
+ act_dropout):
+ super(Plato2Encoder, self).__init__()
+
+ self.n_head = n_head
+
+ self.word_embedding_layer = nn.Embedding(vocab_size, hidden_size)
+ self.sent_embedding_layer = nn.Embedding(type_size, hidden_size)
+ self.pos_embedding_layer = nn.Embedding(max_position_seq_len, hidden_size)
+
+ self.encoder_layers = []
+ for i in range(num_layers):
+ encoder_layer = Plato2EncoderLayer(n_head, hidden_size, attn_dropout, act_dropout)
+ self.encoder_layers.append(encoder_layer)
+ self.add_sublayer('layers.' + str(i), encoder_layer)
+ self.post_encoder_layer_norm = nn.LayerNorm(hidden_size)
+
+ self.dropout_layer = nn.Dropout(act_dropout)
+
+ def forward(self, caches, token_ids, type_ids, pos_ids, generation_mask, aux_emb=None):
+ out, self_attn_mask = self.gen_input(token_ids, type_ids, pos_ids, generation_mask, aux_emb)
+
+ new_caches = []
+ for i, encoder_layer in enumerate(self.encoder_layers):
+ out, new_cache = encoder_layer(out, self_attn_mask, caches[i])
+ new_caches.append(new_cache)
+
+ enc_output = self.post_encoder_layer_norm(out)
+ return enc_output, new_caches
+
+ def gen_input(self, token_ids, type_ids, pos_ids, input_mask, aux_emb=None):
+ token_emb_out = self.word_embedding_layer(token_ids)
+ type_emb_out = self.sent_embedding_layer(type_ids)
+ pos_emb_out = self.pos_embedding_layer(pos_ids)
+ emb_out = token_emb_out + type_emb_out + pos_emb_out
+
+ # auxiliary memory embeddings
+ if aux_emb is not None:
+ emb_out = paddle.concat([aux_emb, emb_out], axis=1)
+
+ emb_out = self.dropout_layer(emb_out)
+
+ # generate n-head self-attention mask
+ self_attn_mask = input_mask
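+        # the scale below maps the 0/1 visibility mask to an additive attention bias: (mask - 1) * 1e4, i.e. 0 where attention is allowed and -1e4 where it is masked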
+ self_attn_mask = paddle.scale(x=self_attn_mask, scale=1e4, bias=-1.0, bias_after_scale=False)
+ n_head_self_attn_mask = paddle.stack(x=[self_attn_mask] * self.n_head, axis=1)
+ n_head_self_attn_mask.stop_gradient = True
+
+ return emb_out, n_head_self_attn_mask
+
+ def gen_caches(self, key):
+ caches = [encoder_layer.gen_cache(key) for encoder_layer in self.encoder_layers]
+ return caches
+
+
+class NSP(nn.Layer):
+
+ def __init__(self, vocab_size, type_size, max_position_seq_len, num_layers, n_head, hidden_size, attn_dropout,
+ act_dropout):
+ super(NSP, self).__init__()
+
+ self.n_head = n_head
+ self.hidden_size = hidden_size
+
+ self.word_embedding_layer = nn.Embedding(vocab_size, hidden_size)
+ self.sent_embedding_layer = nn.Embedding(type_size, hidden_size)
+ self.pos_embedding_layer = nn.Embedding(max_position_seq_len, hidden_size)
+
+ encoder_layer = nn.TransformerEncoderLayer(hidden_size, n_head, hidden_size * 4, act_dropout, 'gelu',
+                                                   attn_dropout, act_dropout, True)
+ encoder_norm = nn.LayerNorm(hidden_size)
+ self.encoder = nn.TransformerEncoder(encoder_layer, num_layers, encoder_norm)
+ self.fc1 = nn.Linear(hidden_size, hidden_size)
+ self.fc2 = nn.Linear(hidden_size, 2)
+
+ self.dropout_layer = nn.Dropout(act_dropout)
+ self.tanh_layer = nn.Tanh()
+ self.softmax = nn.Softmax()
+
+ def forward(self, inputs):
+ token_ids = inputs['token_ids']
+ type_ids = inputs['type_ids']
+ pos_ids = inputs['pos_ids']
+ attention_mask = inputs['attention_mask']
+ label_pos = inputs["label_pos"]
+
+ out, self_attn_mask = self.gen_input(token_ids, type_ids, pos_ids, attention_mask)
+ # [-1, seq_len, hidden_size]
+ enc_out = self.encoder(out, self_attn_mask)
+
+ enc_out = paddle.reshape(enc_out, [-1, self.hidden_size])
+ label_pos = paddle.cast(label_pos, 'int64')
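+        # gather, for each sample, the encoder feature at its label position from the flattened [batch * seq_len, hidden_size] output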
+ out = paddle.gather(enc_out, label_pos)
+ pooled_out = self.fc1(out)
+ pooled_out = self.tanh_layer(pooled_out)
+
+ # [-1, 2]
+ logits = self.fc2(pooled_out)
+ probs = self.softmax(logits)
+
+ return probs
+
+ def gen_input(self, token_ids, type_ids, pos_ids, input_mask, aux_emb=None):
+ token_emb_out = self.word_embedding_layer(token_ids)
+ type_emb_out = self.sent_embedding_layer(type_ids)
+ pos_emb_out = self.pos_embedding_layer(pos_ids)
+ emb_out = token_emb_out + type_emb_out + pos_emb_out
+
+ # auxiliary memory embeddings
+ if aux_emb is not None:
+ emb_out = paddle.concat([aux_emb, emb_out], axis=1)
+
+ emb_out = self.dropout_layer(emb_out)
+
+ # generate n-head self-attention mask
+ self_attn_mask = input_mask
+ self_attn_mask = paddle.scale(x=self_attn_mask, scale=1e4, bias=-1.0, bias_after_scale=False)
+ n_head_self_attn_mask = paddle.stack(x=[self_attn_mask] * self.n_head, axis=1)
+ n_head_self_attn_mask.stop_gradient = True
+
+ return emb_out, n_head_self_attn_mask
+
+
+class Plato2InferModel(nn.Layer):
+
+ def __init__(self,
+ nsp_reader,
+ num_layers,
+ n_head,
+ hidden_size,
+ vocab_size=8001,
+ type_size=2,
+ latent_type_size=20,
+ max_position_seq_len=256,
+ act_dropout=0.1,
+ attn_dropout=0.1,
+ max_dec_len=64,
+ min_dec_len=1,
+ topk=10):
+ super(Plato2InferModel, self).__init__()
+
+ self.nsp_reader = nsp_reader
+ self.num_layers = num_layers
+ self.latent_type_size = latent_type_size
+ self.max_dec_len = max_dec_len
+ self.min_dec_len = min_dec_len
+ self.topk = topk
+ self.unk_id = 0
+ self.bos_id = 1
+ self.eos_id = 2
+ self.mask_id = 8000
+ self.after_eos = paddle.ones([vocab_size]) * -1e9
+ self.after_eos[self.eos_id] = 0
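+        # logits template applied once a sequence has emitted <eos>: every token except <eos> is suppressed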
+ self.is_cn = False
+ self.batch_size = 1
+
+ self.latent_weight = paddle.create_parameter([hidden_size, latent_type_size], 'float32')
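+        # embedding table of PLATO's discrete latent variable: one hidden_size vector per latent type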
+
+ self.plato2_encoder = Plato2Encoder(vocab_size, type_size, max_position_seq_len, num_layers, n_head,
+ hidden_size, attn_dropout, act_dropout)
+
+ self.logits_fc_layer = nn.Linear(hidden_size, hidden_size)
+ self.logits_layer_norm = nn.LayerNorm(hidden_size)
+ self.logits_bias = paddle.create_parameter([vocab_size], 'float32', is_bias=True)
+
+ self.nsp_predictor = NSP(vocab_size, type_size, max_position_seq_len, num_layers, n_head, hidden_size,
+ attn_dropout, act_dropout)
+
+ self.gelu_layer = nn.GELU()
+ self.softmax = nn.Softmax()
+
+ @paddle.no_grad()
+ def forward(self, inputs):
+ token_ids = inputs['token_ids']
+ type_ids = inputs['type_ids']
+ pos_ids = inputs['pos_ids']
+ generation_mask = inputs['generation_mask']
+ latent_id = inputs['latent_id']
+ data_id = inputs['data_id']
+
+ # [-1, 1, latent_type_size]
+ latent_id = F.one_hot(latent_id, self.latent_type_size)
+ # [-1, 1, hidden_size]
+ latent_emb = paddle.matmul(latent_id, self.latent_weight, transpose_y=True)
+
+ caches = self.plato2_encoder.gen_caches(token_ids)
+
+ # [-1, seq_len + 1, hidden_size]
+ enc_out, new_caches = self.plato2_encoder(caches, token_ids, type_ids, pos_ids, generation_mask, latent_emb)
+
+ pred_ids = self.decode(inputs, new_caches)
+
+ nsp_inputs = self.gen_nsp_input(token_ids, pred_ids)
+ # [-1, 2]
+ probs = self.nsp_predictor(nsp_inputs)
+
+ return self.get_results(data_id, token_ids, pred_ids, probs)
+
+ def decode(self, inputs, caches):
+ tgt_ids = inputs['tgt_ids']
+ tgt_pos = inputs['tgt_pos']
+ tgt_generation_mask = inputs['tgt_generation_mask']
+ predictions = tgt_ids
+
+ # TODO
+ step = 0
+ while step < self.max_dec_len:
+ # [-1, 1]
+ append_mask = paddle.cast(tgt_ids != self.eos_id, dtype=tgt_generation_mask.dtype)
+ tgt_generation_mask = paddle.concat([tgt_generation_mask, paddle.unsqueeze(append_mask, 1)], axis=-1)
+ tgt_sent = paddle.ones([tgt_generation_mask.shape[0], 1], dtype=tgt_ids.dtype)
+
+ # [-1, 1, hidden_size]
+ out, caches = self.plato2_encoder(caches, tgt_ids, tgt_sent, tgt_pos, tgt_generation_mask)
+ out = paddle.squeeze(out, axis=1)
+
+ # [-1, hidden_size]
+ trans = self.logits_fc_layer(out)
+ trans = self.gelu_layer(trans)
+ trans = self.logits_layer_norm(trans)
+
+ # [-1, vocab_size]
+ logits = paddle.matmul(trans, self.plato2_encoder.word_embedding_layer.weight,
+ transpose_y=True) + self.logits_bias
+ logits[:, self.unk_id] = -1e9
+ logits[:, self.bos_id] = -1e9
+ logits[:, self.mask_id] = -1e9
+ if step < self.min_dec_len:
+ logits[:, self.eos_id] = -1e9
+ logits = logits * append_mask + (1 - append_mask) * self.after_eos
+ probs = self.softmax(logits)
+
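+            # top-k sampling: zero out all but the k most probable tokens, renormalize, then draw one token id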
+ # [-1, topk]
+ topk_probs, _ = paddle.topk(probs, k=self.topk)
+ mask = paddle.cast(probs >= topk_probs[:, -1:], 'float32')
+ sums = paddle.sum(topk_probs, axis=-1, keepdim=True)
+ new_probs = probs * mask / sums
+ # [-1, 1]
+ sampling_ids = paddle.multinomial(new_probs)
+
+ step = step + 1
+ tgt_ids = sampling_ids
+ tgt_pos = tgt_pos + 1
+ predictions = paddle.concat([predictions, tgt_ids], axis=1)
+ return predictions
+
+ def gen_nsp_input(self, token_ids, pred_ids):
+ token_ids = token_ids.numpy()
+ pred_ids = pred_ids.numpy()
+
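+        # wrap each (context, generated response) pair as an NSP reader example so the NSP head can score the candidate reply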
+ def __reader__():
+ headers = ["src", "tgt", "data_id"]
+
+ Example = namedtuple("Example", headers)
+
+ for i, (raw, pred) in enumerate(zip(token_ids, pred_ids)):
+ context = post_process_context(raw, self.nsp_reader, merge=False)
+ _, response = post_process_response(pred, self.nsp_reader, merge=False)
+ context_tokenized_input = " [SEP] ".join(" ".join(utt) for utt in context)
+ response_tokenized_input = " ".join(response)
+ example = Example(src=context_tokenized_input, tgt=response_tokenized_input, data_id=i)
+ data = self.nsp_reader._convert_example_to_record(example, is_infer=True)
+ yield data
+ return
+
+ generator = self.nsp_reader.data_generator(
+ reader=__reader__,
+ is_infer=True,
+ phase="test",
+ )
+ inputs = next(generator())
+
+ for key in inputs:
+ inputs[key] = paddle.to_tensor(inputs[key])
+ if key in ['token_ids', 'type_ids', 'pos_ids']:
+ inputs[key] = paddle.squeeze(inputs[key], axis=-1)
+ return inputs
+
+ def get_results(self, data_id, token_ids, pred_ids, probs):
+ data_id = data_id.numpy()
+ token_ids = token_ids.numpy()
+ pred_ids = pred_ids.numpy()
+ probs = probs.numpy()
+
+ infos = []
+ for raw, pred, prob in zip(token_ids, pred_ids, probs):
+ tokens = post_process_context(raw, self.nsp_reader)
+ pred_token_ids, pred_tokens = post_process_response(pred, self.nsp_reader)
+ info = {}
+ info['response'] = ' '.join(pred_tokens)
+ cross_turn_repetition = get_cross_turn_repetition(tokens, pred_tokens, self.nsp_reader.eos_id, self.is_cn)
+ in_turn_repetition = max(get_in_turn_repetition(pred_tokens, self.is_cn),
+ get_in_turn_repetition(pred_token_ids))
+
+ info['score'] = float(prob[1])
+ if len(pred_token_ids) >= self.max_dec_len:
+ info['score'] -= 1e3
+ elif cross_turn_repetition > 0:
+ info['score'] -= 1e3
+ elif in_turn_repetition > 0:
+ info['score'] -= 1e3
+ infos.append(info)
+
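+        # group the candidates by data_id and keep only the highest-scoring response for each input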
+ results = []
+ pre_idx = 0
+ sample = []
+ for idx, info in zip(data_id, infos):
+ if idx != pre_idx:
+ sample = sorted(sample, key=lambda info: -info["score"])
+ result = sample[0]
+ result['data_id'] = pre_idx
+                results.append(result)
+ sample = []
+ pre_idx = idx
+ sample.append(info)
+ if sample:
+ sample = sorted(sample, key=lambda info: -info["score"])
+ result = sample[0]
+ result['data_id'] = pre_idx
+ results.append(result)
+ return results
diff --git a/modules/text/text_generation/plato2_en_large/models/__init__.py b/modules/text/text_generation/plato2_en_large/models/__init__.py
deleted file mode 100644
index 353c85a2..00000000
--- a/modules/text/text_generation/plato2_en_large/models/__init__.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Define model."""
-
-from plato2_en_large.models.model_base import Model
-
-MODEL_REGISTRY = {}
-
-__all__ = ["MODEL_REGISTRY", "register_model", "create_model", "add_cmdline_args"]
-
-
-def register_model(name):
- """
- Register a new model class.
- """
-
- def __wrapped__(cls):
- if name in MODEL_REGISTRY:
- raise ValueError(f"Cannot register duplicate model ({name})")
- if not issubclass(cls, Model):
- raise ValueError(f"Model ({name}: {cls.__name__}) must extend Model")
- MODEL_REGISTRY[name] = cls
- return cls
-
- return __wrapped__
-
-
-def create_model(args, place) -> Model:
- """
- Create a model.
- """
- return MODEL_REGISTRY[args.model](args, place)
-
-
-def add_cmdline_args(parser):
- """ Add cmdline argument of Model. """
- group = parser.add_argument_group("Model")
-
- # Model
- group.add_argument("--model", type=str, required=True)
-
- # Config
- group.add_argument("--config_path", type=str, required=True)
-
- # Model related.
- args, _ = parser.parse_known_args()
- if args.model not in MODEL_REGISTRY:
- raise ValueError(f"Unknown model type: {args.model}")
- MODEL_REGISTRY[args.model].add_cmdline_args(parser)
- return group
-
-
-import plato2_en_large.models.nsp_model
-import plato2_en_large.models.plato
diff --git a/modules/text/text_generation/plato2_en_large/models/generator.py b/modules/text/text_generation/plato2_en_large/models/generator.py
deleted file mode 100644
index 0117f2f7..00000000
--- a/modules/text/text_generation/plato2_en_large/models/generator.py
+++ /dev/null
@@ -1,268 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Generator class"""
-
-import numpy as np
-import paddle.fluid.layers as layers
-
-from plato2_en_large.utils.args import str2bool
-
-
-class Generator(object):
- """
- Generator class
-
- Use generator in inference phase.
- """
-
- @classmethod
- def add_cmdline_args(cls, parser):
- """Add cmdline argurments."""
- group = parser.add_argument_group("Generator")
- group.add_argument("--min_dec_len", type=int, default=1)
- group.add_argument("--max_dec_len", type=int, default=64)
-
- group.add_argument(
- "--decoding_strategy",
- type=str,
- default="topk_sampling",
- choices=["beam_search", "topk_sampling", "topp_sampling"])
- group.add_argument("--temperature", type=float, default=1.)
- group.add_argument("--ignore_unk", type=str2bool, default=True)
-
- # multi sampling
- group.add_argument("--num_samples", type=int, default=None)
-
- # top-k sampling
- group.add_argument("--topk", type=int, default=10)
-
- # top-p sampling
- group.add_argument("--topp", type=float, default=0.9)
-
- # beam search
- group.add_argument("--beam_size", type=int, default=10)
- group.add_argument("--length_average", type=str2bool, default=True)
- group.add_argument("--length_penalty", type=float, default=0.0)
-
- return group
-
- def __init__(self, args):
- self.min_dec_len = args.min_dec_len
- self.max_dec_len = args.max_dec_len
- self.eos_id = args.eos_id
- self.unk_id = args.unk_id
- self.mask_id = args.mask_id
- self.vocab_size = args.vocab_size
-
- # model related
-
- # basic settings
- self.decoding_strategy = args.decoding_strategy
- self.ignore_unk = args.ignore_unk
- self.continuous_position = args.continuous_position
- self.temperature = args.temperature
-
- # reranking
- self.num_samples = args.num_samples
-
- # top-k sampling
- self.topk = args.topk
-
- # top-p sampling
- self.topp = args.topp
-
- # beam search
- self.beam_size = args.beam_size
- self.length_penalty = args.length_penalty
- self.length_average = args.length_average
- return
-
- def inference(self, model, inputs, outputs):
- """
- Run inference.
-
- Args:
- inputs(dict): Its key is input name(str) and its value is a Variable.
- model(object): A generate model. Need to implement `_generation_network` and `_calc_logits`.
-
- Returns:
- dict(str:Variable): Its key is output name(str) and its value is a Variable.
- """
- # prepare while loop
- max_len = layers.fill_constant(shape=[1], dtype="int64", value=self.max_dec_len, force_cpu=True)
- min_len = layers.fill_constant(shape=[1], dtype="int64", value=self.min_dec_len, force_cpu=True)
- step_idx = layers.fill_constant(shape=[1], dtype="int64", value=0, force_cpu=True)
-
- ids = layers.array_write(layers.reshape(inputs["tgt_ids"], (-1, 1)), step_idx)
- pos_biases = layers.array_write(layers.reshape(inputs["tgt_pos"], (-1, 1)), step_idx)
- scores = layers.array_write(inputs["init_score"], step_idx)
- tgt_generation_mask = layers.array_write(inputs["tgt_generation_mask"], step_idx)
- parent_idx = inputs["parent_idx"]
-
- if self.decoding_strategy == "beam_search":
- beam_size = self.beam_size
- else:
- beam_size = 1
-
- eos_penalty = np.zeros(self.vocab_size, dtype="float32")
- eos_penalty[self.eos_id] = -1e9
- eos_penalty = layers.assign(eos_penalty)
-
- token_penalty = np.zeros(self.vocab_size, dtype="float32")
- token_penalty[self.unk_id] = -1e9
- if self.mask_id >= 0:
- token_penalty[self.mask_id] = -1e9
- token_penalty = layers.assign(token_penalty)
-
- # start while loop
- cond = layers.less_than(x=step_idx, y=max_len)
- while_op = layers.While(cond)
- with while_op.block():
- pre_ids = layers.array_read(array=ids, i=step_idx)
- pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
- pre_scores = layers.array_read(array=scores, i=step_idx)
- pos_bias = layers.array_read(array=pos_biases, i=step_idx)
- pos_bias = layers.gather(input=pos_bias, index=parent_idx)
-
- tmp_tgt_generation_mask = layers.array_read(tgt_generation_mask, i=step_idx)
- dtype = tmp_tgt_generation_mask.dtype
-
- append_mask = layers.fill_constant_batch_size_like(input=pre_ids, value=1.0, shape=[-1, 1, 1], dtype=dtype)
- tmp_tgt_generation_mask = layers.concat([tmp_tgt_generation_mask, append_mask], axis=2)
- pre_mask = tmp_tgt_generation_mask = layers.gather(input=tmp_tgt_generation_mask, index=parent_idx)
-
- pre_sent = layers.fill_constant_batch_size_like(
- input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype)
-
- if self.continuous_position:
- pre_pos = layers.elementwise_mul(
- x=layers.fill_constant_batch_size_like(
- input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype),
- y=step_idx,
- axis=0) + pos_bias
- else:
- pre_pos = layers.elementwise_mul(
- x=layers.fill_constant_batch_size_like(
- input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype),
- y=step_idx,
- axis=0)
-
- dec_out, _ = model._generation_network(
- token_ids=pre_ids,
- type_ids=pre_sent,
- pos_ids=pre_pos,
- generation_mask=tmp_tgt_generation_mask,
- gather_idx=parent_idx)
- logits = model._calc_logits(dec_out)
-
- # ignore unk and mask token
- if self.ignore_unk:
- logits = layers.elementwise_add(logits, token_penalty, axis=1)
-
- # min dec length
- min_len_cond = layers.less_than(x=step_idx, y=min_len)
-
- def min_len_penalty():
- """Plus minimum length penalty."""
- return layers.elementwise_add(logits, eos_penalty, axis=1)
-
- def no_penalty():
- """No penalty."""
- return logits
-
- logits = layers.case([(min_len_cond, min_len_penalty)], default=no_penalty)
-
- # get probs
- probs = layers.softmax(logits / self.temperature)
-
- if self.decoding_strategy == "beam_search":
- topk_scores, topk_indices = layers.topk(input=probs, k=beam_size)
- else:
- if self.decoding_strategy.startswith("sampling"):
- sampling_ids = layers.sampling_id(probs, dtype="int")
- elif self.decoding_strategy.startswith("topk_sampling"):
- topk_probs, _ = layers.topk(input=probs, k=self.topk)
- ge_cond = layers.cast(
- layers.greater_equal(probs, layers.unsqueeze(topk_probs[:, -1], [1])), "float32")
- old_probs = probs
- probs = probs * ge_cond / layers.reduce_sum(topk_probs, dim=-1, keep_dim=True)
- sampling_ids = layers.sampling_id(probs, dtype="int")
- probs = old_probs
- elif self.decoding_strategy.startswith("topp_sampling"):
- sorted_probs, sorted_idx = layers.argsort(probs, descending=True)
- cum_sorted_probs = layers.cumsum(sorted_probs, axis=1, exclusive=True)
- lt_cond = layers.cast(
- layers.less_than(
- cum_sorted_probs,
- layers.fill_constant_batch_size_like(cum_sorted_probs, cum_sorted_probs.shape,
- cum_sorted_probs.dtype, self.topp)), "float32")
- old_probs = probs
- candidate_probs = sorted_probs * lt_cond
- probs = candidate_probs / layers.reduce_sum(candidate_probs, dim=-1, keep_dim=True)
- sampling_ids = layers.sampling_id(probs, dtype="int")
- sampling_ids = layers.index_sample(sorted_idx, layers.unsqueeze(sampling_ids, [1]))
- sampling_ids = layers.squeeze(sampling_ids, [1])
- probs = old_probs
- else:
- raise ValueError(self.decoding_strategy)
-
- sampling_scores = layers.one_hot(layers.unsqueeze(sampling_ids, [1]), probs.shape[1])
- sampling_scores = sampling_scores * probs - (1 - sampling_scores) * 1e3
- topk_scores, topk_indices = layers.topk(input=sampling_scores, k=1)
-
- pre_len = layers.cast(step_idx, "float32")
- layers.increment(x=step_idx, value=1.0, in_place=True)
- cur_len = layers.cast(step_idx, "float32")
-
- # update scores
- if self.length_average:
- accu_scores = layers.elementwise_add(
- x=layers.log(topk_scores), y=pre_scores * pre_len, axis=0) / cur_len
- elif self.length_penalty > 0:
- pre_lp = layers.pow((5 + pre_len) / 6, self.length_penalty)
- cur_lp = layers.pow((5 + cur_len) / 6, self.length_penalty)
- accu_scores = layers.elementwise_add(x=layers.log(topk_scores), y=pre_scores * pre_lp, axis=0) / cur_lp
- else:
- accu_scores = layers.elementwise_add(x=layers.log(topk_scores), y=pre_scores, axis=0)
- topk_indices = layers.lod_reset(topk_indices, pre_ids)
- accu_scores = layers.lod_reset(accu_scores, pre_ids)
- selected_ids, selected_scores, gather_idx = layers.beam_search(
- pre_ids=pre_ids,
- pre_scores=pre_scores,
- ids=topk_indices,
- scores=accu_scores,
- beam_size=beam_size,
- end_id=self.eos_id,
- return_parent_idx=True)
-
- layers.array_write(selected_ids, i=step_idx, array=ids)
- layers.array_write(selected_scores, i=step_idx, array=scores)
- layers.array_write(pre_mask, i=step_idx, array=tgt_generation_mask)
- layers.array_write(pos_bias, i=step_idx, array=pos_biases)
-
- layers.assign(gather_idx, parent_idx)
-
- length_cond = layers.less_than(x=step_idx, y=max_len)
- finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
- layers.logical_and(x=length_cond, y=finish_cond, out=cond)
-
- finished_ids, finished_scores = layers.beam_search_decode(ids, scores, beam_size=beam_size, end_id=self.eos_id)
-
- predictions = {
- "finished_ids": finished_ids,
- "finished_scores": finished_scores,
- "token_ids": inputs["token_ids"],
- "data_id": inputs["data_id"]
- }
- return predictions
diff --git a/modules/text/text_generation/plato2_en_large/models/model_base.py b/modules/text/text_generation/plato2_en_large/models/model_base.py
deleted file mode 100644
index 6edcadda..00000000
--- a/modules/text/text_generation/plato2_en_large/models/model_base.py
+++ /dev/null
@@ -1,288 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Model base."""
-
-from abc import abstractmethod, ABC
-
-import paddle.fluid as fluid
-from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
-import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-import paddle.fluid.layers as layers
-
-from plato2_en_large.models.optimizer import AdamW
-from plato2_en_large.utils import init_pretraining_params, init_checkpoint, to_lodtensor
-from plato2_en_large.utils.args import str2bool
-
-
-class Model(ABC):
- """
- Basic model wrapper for paddle.
- """
-
- @classmethod
- def add_cmdline_args(cls, parser):
- """Add cmdline argurments."""
- group = parser.add_argument_group("Model")
- # Init checkpoint
- group.add_argument("--init_checkpoint", type=str, default="")
- group.add_argument("--init_pretraining_params", type=str, default="")
-
- # Optimizer
- group.add_argument("-lr", "--learning_rate", type=float, default=1e-5, help="The learning rate for optimizer.")
- group.add_argument("--warmup_steps", type=int, default=0, help="The warmup steps.")
- group.add_argument("--weight_decay", type=float, default=0.0, help="The weight decay for optimizer.")
- group.add_argument("--max_grad_norm", type=float, default=.1, help="The maximum norm of gradient.")
-
- group.add_argument("--use_recompute", type=str2bool, default=False)
- group.add_argument("--use_amp", type=str2bool, default=False)
- group.add_argument("--amp_loss_scaling", type=float, default=12800)
- return group
-
- def __init__(self, args, place):
- self.place = place
- self.exe = fluid.Executor(place)
-
- self.init_checkpoint = args.init_checkpoint
- self.init_pretraining_params = args.init_pretraining_params
-
- self.learning_rate = args.learning_rate
- self.warmup_steps = args.warmup_steps
- self.weight_decay = args.weight_decay
- self.max_grad_norm = args.max_grad_norm
-
- self.is_distributed = args.is_distributed
- self.use_recompute = args.use_recompute
- self.use_amp = args.use_amp
- self.amp_loss_scaling = args.amp_loss_scaling
- self.run_infer = args.get("run_infer", False)
- self.batch_size = args.get("batch_size", 1)
- self._build_programs()
- return
-
- def _build_programs(self):
- """
- Build programs.
-
- Build train_program, eval_program and inference_program. Only use in static graph mode.
- """
- if self.run_infer:
- self.startup_program = fluid.Program()
- # build infer program
- self.infer_program = fluid.Program()
- with fluid.program_guard(self.infer_program, self.startup_program):
- with fluid.unique_name.guard():
- self.infer_feed_dict = inputs = self._get_feed_dict(is_infer=True)
- outputs = self.forward(inputs, is_infer=True)
- predictions = self.infer(inputs, outputs)
- self.infer_fetch_dict = predictions
- self.infer_program = self.infer_program.clone(for_test=True)
-
- self.program = self.infer_program
- else:
- if self.is_distributed:
- exec_strategy = fluid.ExecutionStrategy()
- exec_strategy.use_experimental_executor = True
- exec_strategy.num_threads = 4
- exec_strategy.num_iteration_per_drop_scope = 1
-
- dist_strategy = DistributedStrategy()
- dist_strategy.exec_strategy = exec_strategy
- dist_strategy.nccl_comm_num = 1
- dist_strategy.fuse_all_reduce_ops = True
- if self.use_recompute:
- dist_strategy.forward_recompute = True
- dist_strategy.enable_sequential_execution = True
- if self.use_amp:
- dist_strategy.use_amp = True
- dist_strategy.amp_loss_scaling = self.amp_loss_scaling
- self.dist_strategy = dist_strategy
-
- self.startup_program = fluid.Program()
- # build train program
- self.train_program = fluid.Program()
- with fluid.program_guard(self.train_program, self.startup_program):
- with fluid.unique_name.guard():
- self.feed_dict = inputs = self._get_feed_dict()
- outputs = self.forward(inputs)
- if self.is_distributed and self.use_recompute:
- self.dist_strategy.recompute_checkpoints = outputs["checkpoints"]
- metrics, statistics = self.get_metrics_and_statistics(inputs, outputs)
-
- # build eval program
- self.eval_program = self.train_program.clone(for_test=True)
- self.eval_fetch_dict = {**metrics, **statistics}
-
- scheduled_lr = self.optimize(metrics)
- metrics["scheduled_lr"] = scheduled_lr
- self.train_fetch_dict = metrics
-
- self.program = self.train_program
- if self.is_distributed:
- self.train_program = fleet.main_program
-
- self.exe.run(self.startup_program)
- if self.init_pretraining_params != "":
- init_pretraining_params(self.exe, self.init_pretraining_params, self.program)
- elif self.init_checkpoint != "":
- init_checkpoint(self.exe, self.init_checkpoint, self.program)
- return
-
- def load(self, model_dir, is_checkpoint=False):
- """
- Load persistables or parameters.
- """
- # TODO: support dygraph.
- if is_checkpoint:
- init_checkpoint(self.exe, model_dir, self.program)
- else:
- init_pretraining_params(self.exe, model_dir, self.program)
- return
-
- def save(self, model_dir, is_checkpoint=False):
- """
- Save persistables or parameters.
- """
- # TODO: support dygraph.
- if is_checkpoint:
- fluid.io.save_persistables(self.exe, model_dir, self.program)
- else:
- fluid.io.save_params(self.exe, model_dir, self.program)
- return
-
- @abstractmethod
- def _get_feed_dict(self, is_infer=False):
- """
- Return input feed list.
- """
- pass
-
- def _get_feed(self, inputs, is_infer=False):
- """
- Convert `inputs` into model's feed data format.
- """
- if isinstance(inputs, list):
- # return list direclty which is used in `get_data_loader`.
- return inputs
- for k in inputs:
- if isinstance(inputs[k], list):
- inputs[k] = to_lodtensor(inputs[k], self.place)
- return inputs
-
- def get_data_loader(self, generator=None, is_infer=False):
- """
- Return DataLoader.
-
- If generator is not `None`, the data loader set it as the batch generator.
- """
- # TODO: support dygraph.
- if is_infer:
- feed_name_list, feed_list = zip(*self.infer_feed_dict.items())
- else:
- feed_name_list, feed_list = zip(*self.feed_dict.items())
- loader = fluid.io.DataLoader.from_generator(
- feed_list=feed_list, capacity=64, use_double_buffer=True, iterable=True)
- if generator is not None:
-
- def __wrapper__():
- for batch in generator():
- batch = self._get_feed(batch)
- batch = [batch[name] for name in feed_name_list if name in batch]
- yield batch
-
- loader.set_batch_generator(__wrapper__, self.place)
- return loader
-
- @abstractmethod
- def forward(self, inputs, is_infer=False):
- """
- Run model main forward.
- """
- pass
-
- @abstractmethod
- def get_metrics_and_statistics(self, inputs, outputs):
- """
- Get metrics and statistics.
- """
- pass
-
- @abstractmethod
- def infer(self, inputs, outputs):
- """
- Run model inference.
- """
- pass
-
- def optimize(self, metrics):
- """
- Optimize the model by metrics(mainly `metrics["loss"]`).
- """
- # TODO: support dygraph
- if self.warmup_steps > 0:
- scheduled_lr = layers.learning_rate_scheduler.noam_decay(1 / (self.warmup_steps * (self.learning_rate**2)),
- self.warmup_steps)
- else:
- scheduled_lr = layers.create_global_var(
- name=fluid.unique_name.generate("learning_rate"),
- shape=[1],
- value=self.learning_rate,
- dtype="float32",
- persistable=True)
- grad_clip = fluid.clip.GradientClipByGlobalNorm(self.max_grad_norm)
-
- self.optimizer = AdamW(learning_rate=scheduled_lr, grad_clip=grad_clip, weight_decay=self.weight_decay)
-
- if self.is_distributed:
- self.optimizer = fleet.distributed_optimizer(self.optimizer, strategy=self.dist_strategy)
-
- self.optimizer.minimize(metrics["loss"])
- return scheduled_lr
-
- def _execute(self, program, feed, fetch_dict, **kwargs):
- """
- Execute program.
- """
- fetch_list = [var.name for var in fetch_dict.values()]
- fetch_vars = self.exe.run(program, feed, fetch_list, **kwargs)
- return dict(zip(fetch_dict.keys(), fetch_vars))
-
- def train_step(self, inputs):
- """
- Run one training step.
- """
- # TODO: support dygraph.
- return self._execute(self.train_program, self._get_feed(inputs), self.train_fetch_dict, use_program_cache=True)
-
- def eval_step(self, inputs):
- """
- Run one evaluation step.
- """
- # TODO: support dygraph.
- return self._execute(self.eval_program, self._get_feed(inputs), self.eval_fetch_dict)
-
- def infer_step(self, inputs):
- """
- Run one inference step.
- """
- # TODO: support dygraph.
- return self._execute(self.infer_program, self._get_feed(inputs, is_infer=True), self.infer_fetch_dict)
-
- def save_inference_model(self, inference_model_path):
- """
- Save the inference model.
- """
- feed_list = [var.name for var in self.infer_feed_dict.values()]
- fetch_list = list(self.infer_fetch_dict.values())
-
- fluid.io.save_inference_model(inference_model_path, feed_list, fetch_list, self.exe, self.infer_program)
diff --git a/modules/text/text_generation/plato2_en_large/models/nsp_model.py b/modules/text/text_generation/plato2_en_large/models/nsp_model.py
deleted file mode 100644
index 8c8f2d9a..00000000
--- a/modules/text/text_generation/plato2_en_large/models/nsp_model.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""NSP model."""
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-
-from . import register_model
-from .model_base import Model
-from .unified_transformer import UnifiedTransformer
-
-
-@register_model("NSPModel")
-class NSPModel(UnifiedTransformer):
- """NSP model."""
-
- def _get_feed_dict(self, is_infer=False):
- """
- Get the feed list of the model.
-
- Args:
- is_infer(bool): True if running inference.
-
- Returns:
- list(Variable): The feed list.
- list(str): The name of each Variable in feed list.
- """
- feed_dict = {}
- feed_dict["token_ids"] = layers.data(name="token_ids", shape=[-1, self.max_seq_len, 1], dtype="int64")
- feed_dict["type_ids"] = layers.data(name="type_ids", shape=[-1, self.max_seq_len, 1], dtype="int64")
- feed_dict["pos_ids"] = layers.data(name="pos_ids", shape=[-1, self.max_seq_len, 1], dtype="int64")
-
- feed_dict["attention_mask"] = layers.data(
- name="attention_mask", shape=[-1, self.max_seq_len, self.max_seq_len], dtype=self.dtype)
- feed_dict["label_pos"] = layers.data(name="label_pos", shape=[-1, 1], dtype="int64")
-
- if not is_infer:
- feed_dict["label"] = layers.data(name="label", shape=[-1, 1], dtype="int64")
- feed_dict["tgt_label"] = layers.data(name="tgt_ids", shape=[-1, 1], dtype="int64")
- feed_dict["tgt_pos"] = layers.data(name="tgt_pos", shape=[-1, 1], dtype="int64")
-
- feed_dict["data_id"] = layers.data(name="data_id", shape=[-1, 1], dtype="int64")
- return feed_dict
-
- def _get_feed(self, inputs, is_infer=False):
- return Model._get_feed(self, inputs, is_infer)
-
- def forward(self, inputs, is_infer=False):
- outputs = {}
- self.generation_caches = None
- outputs["enc_out"], self.checkpoints = self._generation_network(
- token_ids=inputs["token_ids"],
- type_ids=inputs["type_ids"],
- pos_ids=inputs["pos_ids"],
- generation_mask=inputs["attention_mask"])
- return outputs
-
- def _get_metrics(self, inputs, outputs):
- metrics = {}
- fc_out = self._calc_logits(outputs["enc_out"], inputs["tgt_pos"])
- lm_loss = layers.softmax_with_cross_entropy(logits=fc_out, label=inputs["tgt_pos"])
- need_cal = layers.not_equal(inputs["tgt_label"], layers.fill_constant(shape=[1], dtype="int64", value=1))
- need_cal = layers.cast(need_cal, self.dtype)
- mean_lm_loss = layers.reduce_sum(lm_loss * need_cal) / (layers.reduce_sum(need_cal) + 1e-10)
-
- pooled_out = self._get_pooled_output(outputs["enc_out"], inputs["label_pos"])
- nsp_fc_out = layers.fc(
- input=pooled_out,
- size=2,
- param_attr=fluid.ParamAttr(name="next_sent_fc.w_0", initializer=self.param_initializer),
- bias_attr="next_sent_fc.b_0")
- nsp_loss, nsp_softmax = layers.softmax_with_cross_entropy(
- logits=nsp_fc_out, label=inputs["label"], return_softmax=True)
-
- nsp_acc = layers.accuracy(nsp_softmax, inputs["label"])
- mean_nsp_loss = layers.mean(nsp_loss)
-
- metrics["loss"] = mean_lm_loss + mean_nsp_loss
- metrics["lm_loss"] = mean_lm_loss
- metrics["nsp_loss"] = mean_nsp_loss
- metrics["nsp_acc"] = nsp_acc
- return metrics
-
- def infer(self, inputs, outputs):
- pooled_out = self._get_pooled_output(outputs["enc_out"], inputs["label_pos"])
- nsp_fc_out = layers.fc(
- input=pooled_out,
- size=2,
- param_attr=fluid.ParamAttr(name="next_sent_fc.w_0", initializer=self.param_initializer),
- bias_attr="next_sent_fc.b_0")
- scores = layers.softmax(nsp_fc_out)
- predictions = {"scores": scores, "data_id": inputs["data_id"]}
- return predictions
-
- def infer_step(self, inputs):
- return Model.infer_step(self, inputs)
diff --git a/modules/text/text_generation/plato2_en_large/models/optimizer.py b/modules/text/text_generation/plato2_en_large/models/optimizer.py
deleted file mode 100644
index 0381f123..00000000
--- a/modules/text/text_generation/plato2_en_large/models/optimizer.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Optimizer."""
-
-import re
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-
-
-class AdamW(fluid.optimizer.AdamOptimizer):
- """AdamW object for dygraph"""
-
- def __init__(self, *args, **kwargs):
- weight_decay = kwargs.pop('weight_decay', None)
- var_name_to_exclude = kwargs.pop('var_name_to_exclude', '.*layer_norm_scale|.*layer_norm_bias|.*b_0')
- super(AdamW, self).__init__(*args, **kwargs)
- self.wd = weight_decay
- self.pat = re.compile(var_name_to_exclude)
-
- def apply_optimize(self, loss, startup_program, params_grads):
- """Update params with weight decay."""
- super(AdamW, self).apply_optimize(loss, startup_program, params_grads)
- for p, g in params_grads:
- if not self.pat.match(p.name):
- layers.assign(p * (1. - self.wd * self._learning_rate), p)
diff --git a/modules/text/text_generation/plato2_en_large/models/plato.py b/modules/text/text_generation/plato2_en_large/models/plato.py
deleted file mode 100644
index 987a42d7..00000000
--- a/modules/text/text_generation/plato2_en_large/models/plato.py
+++ /dev/null
@@ -1,241 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Plato model."""
-
-import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-
-from . import register_model
-from .model_base import Model
-from .unified_transformer import UnifiedTransformer
-from .transformer_block import encoder, pre_process_layer
-from plato2_en_large.utils import repeat_array_or_tensor
-from plato2_en_large.utils.args import str2bool
-from .generator import Generator
-
-
-@register_model("Plato")
-class Plato(UnifiedTransformer):
- """Plato model."""
-
- @classmethod
- def add_cmdline_args(cls, parser):
- """Add cmdline argurments."""
- group = UnifiedTransformer.add_cmdline_args(parser)
- group.add_argument("--use_bow", type=str2bool, default=True)
- group.add_argument("--use_entropy", type=str2bool, default=False)
- return group
-
- def __init__(self, args, place):
- # latent related
- self.mask_id = args.mask_id
- self.latent_type_size = args.latent_type_size
- self.latent_emb_name = "latent_embedding"
- self.use_bow = args.use_bow
- self.use_entropy = args.use_entropy
-
- super(Plato, self).__init__(args, place)
-
- def _get_feed_dict(self, is_infer=False):
- feed_dict = {}
- feed_dict["token_ids"] = layers.data(name="token_ids", shape=[-1, self.max_seq_len, 1], dtype="int64")
- feed_dict["type_ids"] = layers.data(name="type_ids", shape=[-1, self.max_seq_len, 1], dtype="int64")
- feed_dict["pos_ids"] = layers.data(name="pos_ids", shape=[-1, self.max_seq_len, 1], dtype="int64")
-
- if not is_infer:
- feed_dict["recognition_mask"] = layers.data(
- name="recognition_mask", shape=[-1, self.max_seq_len + 1, self.max_seq_len + 1], dtype=self.dtype)
- feed_dict["generation_mask"] = layers.data(
- name="generation_mask", shape=[-1, self.max_seq_len + 1, self.max_seq_len + 1], dtype=self.dtype)
-
- if is_infer:
- feed_dict["tgt_ids"] = layers.data(
- name="tgt_ids", shape=[-1, self.max_seq_len, 1], dtype="int64", lod_level=2)
- feed_dict["tgt_pos"] = layers.data(
- name="tgt_pos", shape=[-1, self.max_seq_len, 1], dtype="int64", lod_level=2)
- feed_dict["init_score"] = layers.data(name="init_score", shape=[-1, 1], dtype="float32", lod_level=1)
- feed_dict["parent_idx"] = layers.data(name="parent_idx", shape=[-1], dtype="int64")
-
- feed_dict["tgt_generation_mask"] = layers.data(
- name="tgt_generation_mask", shape=[-1, 1, self.max_seq_len + 1], dtype="float32")
- feed_dict["latent_id"] = layers.data(name="latent_id", shape=[-1, 1], dtype="int64")
- else:
- feed_dict["tgt_label"] = layers.data(name="tgt_label", shape=[-1, 1], dtype="int64")
- feed_dict["tgt_pos"] = layers.data(name="tgt_pos", shape=[-1, 1], dtype="int64")
-
- if self.use_bow:
- feed_dict["bow_label"] = layers.data(name="bow_label", shape=[-1, 1], dtype="int64")
- feed_dict["bow_pos"] = layers.data(name="bow_pos", shape=[-1, 1], dtype="int64")
-
- feed_dict["data_id"] = layers.data(name="data_id", shape=[-1, 1], dtype="int64")
- return feed_dict
-
- def _recognition_network(self, token_ids, type_ids, pos_ids, recognition_mask):
- mask_id = layers.fill_constant_batch_size_like(
- input=token_ids, shape=[-1, 1, 1], value=self.mask_id, dtype="int64")
- mask_emb = layers.embedding(
- input=mask_id,
- size=[self.vocab_size, self.emb_size],
- dtype=self.dtype,
- param_attr=fluid.ParamAttr(name=self.token_emb_name, initializer=self.param_initializer))
- emb_out, n_head_self_attn_mask = self._gen_input(
- token_ids, type_ids, pos_ids, recognition_mask, aux_emb=mask_emb)
-
- recognition_out, checkpoints = self._encode(emb_out, n_head_self_attn_mask)
-
- recognition_feat = layers.slice(input=recognition_out, axes=[1], starts=[0], ends=[1])
- recognition_feat = layers.fc(
- input=recognition_feat,
- size=self.hidden_size,
- act="tanh",
- param_attr=fluid.ParamAttr(name="recognition_fc.w_0", initializer=self.param_initializer),
- bias_attr="recognition_fc.b_0")
- logits = layers.fc(
- input=recognition_feat,
- size=self.latent_type_size,
- param_attr=fluid.ParamAttr(name=self.latent_emb_name, initializer=self.param_initializer),
- bias_attr="recognition_bias")
- return logits, checkpoints
-
- def _gumbel_softmax(self, logits, tau=0.67, eps=1e-10):
- u = layers.uniform_random_batch_size_like(logits, shape=[-1, self.latent_type_size], min=0.0, max=1.0)
- u.stop_gradient = True
- gumbel = 0.0 - layers.log(eps - layers.log(u + eps))
- y = logits + gumbel
- return layers.softmax(y / tau)
-
- def forward(self, inputs, is_infer=False):
- """
- Run model main forward.
- """
- outputs = {}
- if is_infer:
- self.generation_caches = [{
- "k":
- layers.fill_constant_batch_size_like(
- input=inputs["token_ids"], shape=[-1, 0, self.d_key * self.n_head], dtype=self.dtype, value=0),
- "v":
- layers.fill_constant_batch_size_like(
- input=inputs["token_ids"], shape=[-1, 0, self.d_value * self.n_head], dtype=self.dtype, value=0),
- } for i in range(self.n_layer)]
- else:
- self.generation_caches = None
-
- latent_embeddings = layers.create_parameter(
- shape=[self.emb_size, self.latent_type_size],
- dtype=self.dtype,
- attr=fluid.ParamAttr(name=self.latent_emb_name, initializer=self.param_initializer))
-
- if is_infer:
- latent_id = inputs["latent_id"]
- weights = layers.one_hot(latent_id, self.latent_type_size)
- else:
- logits, recognition_checkpoints = self._recognition_network(
- token_ids=inputs["token_ids"],
- type_ids=inputs["type_ids"],
- pos_ids=inputs["pos_ids"],
- recognition_mask=inputs["recognition_mask"],
- )
- outputs["post_probs"] = layers.softmax(logits)
- weights = self._gumbel_softmax(logits)
- outputs["checkpoints"] = recognition_checkpoints
-
- latent_emb = layers.matmul(x=weights, y=latent_embeddings, transpose_y=True)
- outputs["enc_out"], generation_checkpoints = self._generation_network(
- token_ids=inputs["token_ids"],
- type_ids=inputs["type_ids"],
- pos_ids=inputs["pos_ids"],
- generation_mask=inputs["generation_mask"],
- aux_emb=layers.unsqueeze(latent_emb, axes=[1]),
- gather_idx=inputs.get("parent_idx", None),
- )
-
- if not is_infer:
- outputs["checkpoints"].extend(generation_checkpoints)
- return outputs
-
- def _calc_bow_logits(self, enc_out, checkpoints, bow_pos):
- """Get the logits of generation."""
- bow_feat = layers.slice(input=enc_out, axes=[1], starts=[0], ends=[1])
- bow_feat = layers.reshape(x=bow_feat, shape=[-1, self.hidden_size])
- bow_pos = layers.cast(x=bow_pos, dtype="int32")
- bow_feat = layers.gather(input=bow_feat, index=bow_pos)
-
- bow_trans_feat = layers.fc(
- input=bow_feat,
- size=self.emb_size,
- act=self.hidden_act,
- param_attr=fluid.ParamAttr(name="bow_trans_fc.w_0", initializer=self.param_initializer),
- bias_attr=fluid.ParamAttr(name="bow_trans_fc.b_0"))
-
- bow_trans_feat = pre_process_layer(bow_trans_feat, self.post_cls_cmd, name="bow_trans")
-
- checkpoints.append(bow_trans_feat)
-
- if self.weight_sharing:
- fc_out = layers.matmul(
- x=bow_trans_feat,
- y=fluid.default_main_program().global_block().var(self.token_emb_name),
- transpose_y=True)
- if self.cls_bias:
- fc_out += layers.create_parameter(
- shape=[self.vocab_size],
- dtype=self.dtype,
- attr=fluid.ParamAttr(name="bow_out_fc.b_0"),
- is_bias=True)
- else:
- bow_out_bias_attr = fluid.ParamAttr(name="bow_out_fc.b_0") if self.cls_bias else False
- fc_out = layers.fc(
- input=bow_trans_feat,
- size=self.vocab_size,
- param_attr=fluid.ParamAttr(name="bow_out_fc.w_0", initializer=self.param_initializer),
- bias_attr=bow_out_bias_attr)
- return fc_out
-
- def _get_metrics(self, inputs, outputs):
- metrics = super(Plato, self)._get_metrics(inputs, outputs)
-
- if self.use_bow:
- fc_out = self._calc_bow_logits(outputs["enc_out"], outputs["checkpoints"], inputs["bow_pos"])
- bow_loss = layers.softmax_with_cross_entropy(logits=fc_out, label=inputs["bow_label"])
- mean_bow_loss = layers.mean(bow_loss)
- metrics["token_bow_loss"] = mean_bow_loss
- metrics["loss"] = metrics["loss"] + mean_bow_loss
-
- entropy_loss = layers.reduce_sum(outputs["post_probs"] * layers.log(outputs["post_probs"]), dim=1)
- mean_entropy_loss = layers.mean(entropy_loss)
- metrics["entropy_loss"] = mean_entropy_loss
- if self.use_entropy:
- metrics["loss"] = metrics["loss"] + mean_entropy_loss
- return metrics
-
- def infer_step(self, inputs):
- """
- Run one inference step.
- """
- if self.do_generation:
- batch_size = len(inputs["data_id"])
- new_bsz = batch_size * self.latent_type_size
- inputs = {
- name: repeat_array_or_tensor(array_or_tensor, self.place, self.latent_type_size)
- for name, array_or_tensor in inputs.items()
- }
- # Add latent_id
- inputs["latent_id"] = np.array([i for i in range(self.latent_type_size) for _ in range(batch_size)],
- dtype="int64").reshape([-1, 1])
-
- return super(Plato, self).infer_step(inputs)
- else:
- return self._execute(self.infer_program, self._get_feed(inputs, is_infer=True), self.infer_fetch_dict)
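For readers unfamiliar with the latent sampling in the deleted `Plato` class above, the following is a minimal numpy re-statement of `_gumbel_softmax`: it perturbs the recognition logits with Gumbel noise and returns soft one-hot weights over the latent types. The logits and `K=3` below are invented for illustration; the module itself defaults to `latent_type_size=20`.

```python
import numpy as np

def gumbel_softmax(logits, tau=0.67, eps=1e-10):
    # Draw the Gumbel perturbation the same way the deleted layer did:
    # gumbel = -log(eps - log(u + eps)) for u ~ Uniform(0, 1).
    u = np.random.uniform(0.0, 1.0, size=logits.shape)
    gumbel = 0.0 - np.log(eps - np.log(u + eps))
    y = (logits + gumbel) / tau
    y = y - y.max(axis=-1, keepdims=True)      # numerically stable softmax
    e = np.exp(y)
    return e / e.sum(axis=-1, keepdims=True)

logits = np.array([[2.0, 0.5, -1.0]])          # invented recognition logits, K=3
weights = gumbel_softmax(logits)
print(weights.round(3), weights.sum())         # soft one-hot weights, sums to 1.0
```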
diff --git a/modules/text/text_generation/plato2_en_large/models/transformer_block.py b/modules/text/text_generation/plato2_en_large/models/transformer_block.py
deleted file mode 100644
index 041306a5..00000000
--- a/modules/text/text_generation/plato2_en_large/models/transformer_block.py
+++ /dev/null
@@ -1,332 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Transformer block."""
-
-from functools import partial
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-
-
-def multi_head_attention(queries,
- keys,
- values,
- attn_bias,
- d_key,
- d_value,
- d_model,
- n_head=1,
- dropout_rate=0.,
- cache=None,
- gather_idx=None,
- store=False,
- param_initializer=None,
- name="multi_head_att"):
- """
- Multi-Head Attention. Note that attn_bias is added to the logit before
- computing the softmax activation to mask certain selected positions so that
- they will not be considered in attention weights.
- """
- keys = queries if keys is None else keys
- values = keys if values is None else values
-
- if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
- raise ValueError("Inputs: quries, keys and values should all be 3-D tensors.")
-
- def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
- """
- Add linear projection to queries, keys, and values.
- """
- q = layers.fc(
- input=queries,
- size=d_key * n_head,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + "_query_fc.w_0", initializer=param_initializer),
- bias_attr=name + "_query_fc.b_0")
- k = layers.fc(
- input=keys,
- size=d_key * n_head,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + "_key_fc.w_0", initializer=param_initializer),
- bias_attr=name + "_key_fc.b_0")
- v = layers.fc(
- input=values,
- size=d_value * n_head,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + "_value_fc.w_0", initializer=param_initializer),
- bias_attr=name + "_value_fc.b_0")
- return q, k, v
-
- def __split_heads(x, n_head):
- """
- Reshape the last dimension of input tensor x so that it becomes two
- dimensions and then transpose. Specifically, input a tensor with shape
- [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
- with shape [bs, n_head, max_sequence_length, hidden_dim].
- """
- hidden_size = x.shape[-1]
- # The value 0 in shape attr means copying the corresponding dimension
- # size of the input as the output dimension size.
- reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
-
- # permute the dimensions into:
- # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
- return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
-
- def __combine_heads(x):
- """
- Transpose and then reshape the last two dimensions of input tensor x
- so that it becomes one dimension, which is the reverse of __split_heads.
- """
- if len(x.shape) == 3: return x
- if len(x.shape) != 4:
- raise ValueError("Input(x) should be a 4-D Tensor.")
-
- trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
- # The value 0 in shape attr means copying the corresponding dimension
- # size of the input as the output dimension size.
- return layers.reshape(x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True)
-
- def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
- """
- Scaled Dot-Product Attention
- """
- scaled_q = layers.scale(x=q, scale=d_key**-0.5)
- product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
- if attn_bias:
- product += attn_bias
- weights = layers.softmax(product, use_cudnn=True)
- if dropout_rate:
- weights = layers.dropout(
- weights, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False)
- out = layers.matmul(weights, v)
- return out
-
- q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
-
- if cache is not None: # use cache and concat time steps
- # Since the inplace reshape in __split_heads changes the shape of k and
- # v, which is the cache input for next time step, reshape the cache
- # input from the previous time step first.
- cache_k, cache_v = cache["k"], cache["v"]
- select_k = layers.gather(cache_k, index=gather_idx)
- select_v = layers.gather(cache_v, index=gather_idx)
- select_k = layers.reshape(select_k, shape=[0, 0, d_key * n_head])
- select_v = layers.reshape(select_v, shape=[0, 0, d_value * n_head])
- if store:
- k = layers.concat([select_k, k], axis=1)
- v = layers.concat([select_v, v], axis=1)
- layers.assign(k, cache["k"])
- layers.assign(v, cache["v"])
- else:
- #k = select_k
- #v = select_v
- tmp_k = layers.concat([select_k, k[:, :1]], axis=1)
- tmp_v = layers.concat([select_v, v[:, :1]], axis=1)
- layers.assign(tmp_k, cache["k"])
- layers.assign(tmp_v, cache["v"])
- k = layers.concat([select_k, k], axis=1)
- v = layers.concat([select_v, v], axis=1)
-
- q = __split_heads(q, n_head)
- k = __split_heads(k, n_head)
- v = __split_heads(v, n_head)
-
- ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate)
-
- out = __combine_heads(ctx_multiheads)
-
- # Project back to the model size.
- proj_out = layers.fc(
- input=out,
- size=d_model,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + "_output_fc.w_0", initializer=param_initializer),
- bias_attr=name + "_output_fc.b_0")
- return proj_out
-
-
-def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate, hidden_act, param_initializer=None, name="ffn"):
- """
- Position-wise Feed-Forward Networks.
- This module consists of two linear transformations with a ReLU activation
- in between, which is applied to each position separately and identically.
- """
- hidden = layers.fc(
- input=x,
- size=d_inner_hid,
- num_flatten_dims=2,
- act=hidden_act,
- param_attr=fluid.ParamAttr(name=name + "_fc_0.w_0", initializer=param_initializer),
- bias_attr=name + "_fc_0.b_0")
- if dropout_rate:
- hidden = layers.dropout(
- hidden, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False)
- out = layers.fc(
- input=hidden,
- size=d_hid,
- num_flatten_dims=2,
- param_attr=fluid.ParamAttr(name=name + "_fc_1.w_0", initializer=param_initializer),
- bias_attr=name + "_fc_1.b_0")
- return out
-
-
-def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., epsilon=1e-5, name=""):
- """
- Add residual connection, layer normalization and dropout to the out tensor
- optionally according to the value of process_cmd.
- This will be used before or after multi-head attention and position-wise
- feed-forward networks.
- """
- for cmd in process_cmd:
- if cmd == "a": # add residual connection
- out = out + prev_out if prev_out else out
- elif cmd == "n": # add layer normalization
- out = layers.layer_norm(
- out,
- begin_norm_axis=len(out.shape) - 1,
- param_attr=fluid.ParamAttr(name=name + "_layer_norm_scale", initializer=fluid.initializer.Constant(1.)),
- bias_attr=fluid.ParamAttr(name=name + "_layer_norm_bias", initializer=fluid.initializer.Constant(0.)),
- epsilon=epsilon)
- elif cmd == "d": # add dropout
- if dropout_rate:
- out = layers.dropout(
- out, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False)
- return out
-
-
-pre_process_layer = partial(pre_post_process_layer, None)
-post_process_layer = pre_post_process_layer
-
-
-def encoder_layer(input,
- attn_bias,
- n_head,
- d_key,
- d_value,
- d_model,
- d_inner_hid,
- prepostprocess_dropout,
- attention_dropout,
- relu_dropout,
- hidden_act,
- preprocess_cmd="n",
- postprocess_cmd="da",
- param_initializer=None,
- name="",
- epsilon=1e-5,
- cache=None,
- gather_idx=None,
- store=False):
- """
- The encoder layers that can be stacked to form a deep encoder.
- This module consists of a multi-head (self) attention followed by
- position-wise feed-forward networks, with both components wrapped by
- pre_process_layer / post_process_layer to add residual connection,
- layer normalization and dropout.
- """
- attn_output = multi_head_attention(
- pre_process_layer(input, preprocess_cmd, prepostprocess_dropout, epsilon=epsilon, name=name + "_pre_att"),
- None,
- None,
- attn_bias,
- d_key,
- d_value,
- d_model,
- n_head,
- attention_dropout,
- param_initializer=param_initializer,
- name=name + "_multi_head_att",
- cache=cache,
- gather_idx=gather_idx,
- store=store)
- attn_output = post_process_layer(
- input, attn_output, postprocess_cmd, prepostprocess_dropout, name=name + "_post_att", epsilon=epsilon)
- ffd_output = positionwise_feed_forward(
- pre_process_layer(attn_output, preprocess_cmd, prepostprocess_dropout, epsilon=epsilon, name=name + "_pre_ffn"),
- d_inner_hid,
- d_model,
- relu_dropout,
- hidden_act,
- param_initializer=param_initializer,
- name=name + "_ffn")
- ffd_output = post_process_layer(
- attn_output, ffd_output, postprocess_cmd, prepostprocess_dropout, name=name + "_post_ffn", epsilon=epsilon)
- return ffd_output, [attn_output, ffd_output]
-
-
-def encoder(enc_input,
- attn_bias,
- n_layer,
- n_head,
- d_key,
- d_value,
- d_model,
- d_inner_hid,
- prepostprocess_dropout,
- attention_dropout,
- relu_dropout,
- hidden_act,
- preprocess_cmd="n",
- postprocess_cmd="da",
- param_initializer=None,
- name="",
- epsilon=1e-5,
- n_layer_per_block=1,
- param_share="normal",
- caches=None,
- gather_idx=None,
- store=False):
- """
- The encoder is composed of a stack of identical layers returned by calling
- encoder_layer.
- """
- checkpoints = []
- names = []
- if param_share == "inner_share":
- for _ in range(n_layer // n_layer_per_block):
- for i in range(n_layer_per_block):
- names.append(name + "_layer_" + str(i))
- else:
- for i in range(n_layer // n_layer_per_block):
- for _ in range(n_layer_per_block):
- names.append(name + "_layer_" + str(i))
-
- for i in range(n_layer):
- enc_output, cps = encoder_layer(
- enc_input,
- attn_bias,
- n_head,
- d_key,
- d_value,
- d_model,
- d_inner_hid,
- prepostprocess_dropout,
- attention_dropout,
- relu_dropout,
- hidden_act,
- preprocess_cmd,
- postprocess_cmd,
- param_initializer=param_initializer,
- epsilon=epsilon,
- name=names[i],
- cache=caches[i] if caches is not None else None,
- gather_idx=gather_idx,
- store=store)
- checkpoints.extend(cps)
- enc_input = enc_output
- enc_output = pre_process_layer(
- enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder", epsilon=epsilon)
-
- return enc_output, checkpoints
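As a reference for the deleted `transformer_block.py`, here is a small numpy sketch of the scaled dot-product attention it implemented with fluid ops. Shapes follow the `[batch, n_head, seq_len, d_key]` layout produced by `__split_heads`; the random tensors are placeholders, not values from the model.

```python
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def scaled_dot_product_attention(q, k, v, attn_bias=None):
    d_key = q.shape[-1]
    # Scale queries, take dot products against keys, optionally add the bias
    # (large negative values there mask out forbidden positions), then mix values.
    product = (q * d_key ** -0.5) @ k.transpose(0, 1, 3, 2)
    if attn_bias is not None:
        product = product + attn_bias
    return softmax(product) @ v

q = k = v = np.random.rand(1, 2, 4, 8)     # [batch, n_head, seq_len, d_key]
out = scaled_dot_product_attention(q, k, v)
print(out.shape)                           # (1, 2, 4, 8)
```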
diff --git a/modules/text/text_generation/plato2_en_large/models/unified_transformer.py b/modules/text/text_generation/plato2_en_large/models/unified_transformer.py
deleted file mode 100644
index c4ebf115..00000000
--- a/modules/text/text_generation/plato2_en_large/models/unified_transformer.py
+++ /dev/null
@@ -1,378 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Unified Transformer model."""
-
-import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-
-from . import register_model
-from .model_base import Model
-from .transformer_block import encoder, pre_process_layer
-from plato2_en_large.utils.args import str2bool
-from plato2_en_large.utils import repeat_array_or_tensor, slice_array_or_tensor
-from .generator import Generator
-
-
-@register_model("UnifiedTransformer")
-class UnifiedTransformer(Model):
- """Unified Transformer"""
-
- @classmethod
- def add_cmdline_args(cls, parser):
- """Add cmdline argurments."""
- group = Model.add_cmdline_args(parser)
- group.add_argument("--max_seq_len", type=int, default=256)
- group.add_argument("--weight_sharing", type=str2bool, default=True)
- group.add_argument("--mem_efficient", type=str2bool, default=False)
-
- Generator.add_cmdline_args(parser)
- return group
-
- def __init__(self, args, place):
- self.max_seq_len = args.max_seq_len
-
- self.emb_size = args.emb_size or args.hidden_size
- self.hidden_size = args.hidden_size
-
- self.n_layer = args.num_hidden_layers
- self.n_head = args.num_attention_heads
- self.d_key = args.get("key_size", self.hidden_size // self.n_head)
- self.d_value = args.get("value_size", self.hidden_size // self.n_head)
- self.inner_hidden_size = args.get("inner_hidden_size", self.hidden_size * 4)
-
- self.vocab_size = args.vocab_size
- self.max_position_seq_len = args.max_position_embeddings
- self.type_size = args.type_vocab_size
- self.token_emb_name = "word_embedding"
- self.type_emb_name = "sent_embedding"
- self.pos_emb_name = "pos_embedding"
-
- self.epsilon = args.epsilon or 1e-5
- self.n_layer_per_block = args.n_layer_per_block or 1
- self.pre_encoder_cmd = args.get("pre_encoder_cmd", "nd")
- self.preprocess_cmd = args.get("preprocess_cmd", "")
- self.postprocess_cmd = args.get("postprocess_cmd", "dan")
- self.post_cls_cmd = args.get("post_cls_cmd", "n")
- self.cls_bias = args.get("cls_bias", True)
- if self.hidden_size != self.emb_size:
- self.emb_mapping_in = True
- else:
- self.emb_mapping_in = args.get("emb_mapping_in", False)
-
- self.hidden_act = args.hidden_act
- self.prepostprocess_dropout = args.hidden_dropout_prob
- self.attention_dropout = args.attention_probs_dropout_prob
- self.weight_sharing = args.weight_sharing
-
- self.mem_efficient = args.mem_efficient
-
- self.dtype = "float32"
-
- # Initialize all weights by truncated normal initializer, and all biases
- # will be initialized by constant zero by default.
- self.param_initializer = fluid.initializer.TruncatedNormal(scale=args.initializer_range)
-
- # task-related
- self.generator = Generator(args)
- self.do_generation = args.do_generation
-
- super(UnifiedTransformer, self).__init__(args, place)
-
- def _gen_input(self, token_ids, type_ids, pos_ids, input_mask, aux_emb=None):
- token_emb_out = layers.embedding(
- input=token_ids,
- size=[self.vocab_size, self.emb_size],
- dtype=self.dtype,
- param_attr=fluid.ParamAttr(name=self.token_emb_name, initializer=self.param_initializer))
- type_emb_out = layers.embedding(
- input=type_ids,
- size=[self.type_size, self.emb_size],
- dtype=self.dtype,
- param_attr=fluid.ParamAttr(name=self.type_emb_name, initializer=self.param_initializer))
- pos_emb_out = layers.embedding(
- input=pos_ids,
- size=[self.max_position_seq_len, self.emb_size],
- dtype=self.dtype,
- param_attr=fluid.ParamAttr(name=self.pos_emb_name, initializer=self.param_initializer))
- emb_out = token_emb_out + type_emb_out + pos_emb_out
-
- # auxiliary memory embeddings
- if aux_emb is not None:
- emb_out = layers.concat([aux_emb, emb_out], axis=1)
-
- # post process of embedding
- emb_out = pre_process_layer(
- emb_out, self.pre_encoder_cmd, self.prepostprocess_dropout, name="pre_encoder", epsilon=self.epsilon)
- if self.emb_mapping_in:
- emb_out = layers.fc(
- input=emb_out,
- num_flatten_dims=2,
- size=self.hidden_size,
- param_attr=fluid.ParamAttr(name="emb_hidden_mapping", initializer=self.param_initializer),
- bias_attr="emb_hidden_mapping_bias")
-
- # generate n-head self-attention mask
- self_attn_mask = input_mask
- self_attn_mask = layers.scale(x=self_attn_mask, scale=1e4, bias=-1.0, bias_after_scale=False)
- n_head_self_attn_mask = layers.stack(x=[self_attn_mask] * self.n_head, axis=1)
- n_head_self_attn_mask.stop_gradient = True
-
- return emb_out, n_head_self_attn_mask
-
- def _get_pooled_output(self, enc_out, pos):
- enc_out = layers.reshape(x=enc_out, shape=[-1, self.hidden_size])
- pos = layers.cast(x=pos, dtype="int32")
- feat = layers.gather(input=enc_out, index=pos)
-
- pooled_out = layers.fc(
- input=feat,
- size=self.hidden_size,
- act="tanh",
- param_attr=fluid.ParamAttr(name="pooled_fc.w_0", initializer=self.param_initializer),
- bias_attr="pooled_fc.b_0")
- return pooled_out
-
- def _generation_network(self, token_ids, type_ids, pos_ids, generation_mask, aux_emb=None, gather_idx=None):
- emb_out, n_head_self_attn_mask = self._gen_input(token_ids, type_ids, pos_ids, generation_mask, aux_emb=aux_emb)
- return self._encode(emb_out, n_head_self_attn_mask, self.generation_caches, gather_idx=gather_idx)
-
- def _encode(self, emb_out, n_head_self_attn_mask, caches=None, gather_idx=None):
- return encoder(
- enc_input=emb_out,
- attn_bias=n_head_self_attn_mask,
- n_layer=self.n_layer,
- n_head=self.n_head,
- d_key=self.d_key,
- d_value=self.d_value,
- d_model=self.hidden_size,
- d_inner_hid=self.inner_hidden_size,
- prepostprocess_dropout=self.prepostprocess_dropout,
- attention_dropout=self.attention_dropout,
- relu_dropout=0,
- hidden_act=self.hidden_act,
- preprocess_cmd=self.preprocess_cmd,
- postprocess_cmd=self.postprocess_cmd,
- param_initializer=self.param_initializer,
- epsilon=self.epsilon,
- n_layer_per_block=self.n_layer_per_block,
- name="encoder",
- caches=caches,
- gather_idx=gather_idx,
- store=caches is not None)
-
- def _gumbel_softmax(self, logits, tau=0.67, eps=1e-10):
- u = layers.uniform_random_batch_size_like(logits, shape=[-1, self.latent_type_size], min=0.0, max=1.0)
- u.stop_gradient = True
- gumbel = 0.0 - layers.log(eps - layers.log(u + eps))
- y = logits + gumbel
- return layers.softmax(y / tau)
-
- def _get_feed_dict(self, is_infer=False):
- """
- Get the feed list of the model.
-
- Args:
- is_infer(bool): True if running inference.
-
- Returns:
- dict: A dict mapping each feed name to its layers.data Variable.
- """
- feed_dict = {}
- feed_dict["token_ids"] = layers.data(name="token_ids", shape=[-1, self.max_seq_len, 1], dtype="int64")
- feed_dict["type_ids"] = layers.data(name="type_ids", shape=[-1, self.max_seq_len, 1], dtype="int64")
- feed_dict["pos_ids"] = layers.data(name="pos_ids", shape=[-1, self.max_seq_len, 1], dtype="int64")
-
- feed_dict["generation_mask"] = layers.data(
- name="generation_mask", shape=[-1, self.max_seq_len, self.max_seq_len], dtype=self.dtype)
-
- if is_infer:
- feed_dict["tgt_ids"] = layers.data(
- name="tgt_ids", shape=[-1, self.max_seq_len, 1], dtype="int64", lod_level=2)
- feed_dict["tgt_pos"] = layers.data(
- name="tgt_pos", shape=[-1, self.max_seq_len, 1], dtype="int64", lod_level=2)
- feed_dict["init_score"] = layers.data(name="init_score", shape=[-1, 1], dtype="float32", lod_level=1)
- feed_dict["parent_idx"] = layers.data(name="parent_idx", shape=[-1], dtype="int64")
-
- feed_dict["tgt_generation_mask"] = layers.data(
- name="tgt_generation_mask", shape=[-1, 1, self.max_seq_len], dtype="float32")
- else:
- feed_dict["tgt_label"] = layers.data(name="tgt_label", shape=[-1, 1], dtype="int64")
- feed_dict["tgt_pos"] = layers.data(name="tgt_pos", shape=[-1, 1], dtype="int64")
-
- feed_dict["data_id"] = layers.data(name="data_id", shape=[-1, 1], dtype="int64")
- return feed_dict
-
- def forward(self, inputs, is_infer=False):
- """
- Run model main forward.
- """
- outputs = {}
- if is_infer:
- self.generation_caches = [{
- "k":
- layers.fill_constant_batch_size_like(
- input=inputs["token_ids"], shape=[-1, 0, self.d_key * self.n_head], dtype=self.dtype, value=0),
- "v":
- layers.fill_constant_batch_size_like(
- input=inputs["token_ids"], shape=[-1, 0, self.d_value * self.n_head], dtype=self.dtype, value=0),
- } for i in range(self.n_layer)]
- else:
- self.generation_caches = None
-
- outputs["enc_out"], generation_checkpoints = self._generation_network(
- token_ids=inputs["token_ids"],
- type_ids=inputs["type_ids"],
- pos_ids=inputs["pos_ids"],
- generation_mask=inputs["generation_mask"],
- gather_idx=inputs.get("parent_idx", None))
-
- if not is_infer:
- outputs["checkpoints"] = generation_checkpoints
- return outputs
-
- def _calc_logits(self, enc_out, checkpoints=None, seq_pos=None):
- """Get the logits of generation."""
- enc_out = layers.reshape(x=enc_out, shape=[-1, self.hidden_size])
- if seq_pos is not None:
- seq_pos = layers.cast(x=seq_pos, dtype="int32")
- seq_feat = layers.gather(input=enc_out, index=seq_pos)
- else:
- seq_feat = enc_out
-
- seq_trans_feat = layers.fc(
- input=seq_feat,
- size=self.emb_size,
- act=self.hidden_act,
- param_attr=fluid.ParamAttr(name="mask_lm_trans_fc.w_0", initializer=self.param_initializer),
- bias_attr=fluid.ParamAttr(name="mask_lm_trans_fc.b_0"))
-
- seq_trans_feat = pre_process_layer(seq_trans_feat, self.post_cls_cmd, name="mask_lm_trans")
-
- if checkpoints is not None:
- checkpoints.append(seq_trans_feat)
-
- if self.weight_sharing:
- fc_out = layers.matmul(
- x=seq_trans_feat,
- y=fluid.default_main_program().global_block().var(self.token_emb_name),
- transpose_y=True)
- if self.cls_bias:
- fc_out += layers.create_parameter(
- shape=[self.vocab_size],
- dtype=self.dtype,
- attr=fluid.ParamAttr(name="mask_lm_out_fc.b_0"),
- is_bias=True)
- else:
- seq_out_bias_attr = fluid.ParamAttr(name="mask_lm_out_fc.b_0") if self.cls_bias else False
- fc_out = layers.fc(
- input=seq_trans_feat,
- size=self.vocab_size,
- param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0", initializer=self.param_initializer),
- bias_attr=seq_out_bias_attr)
- return fc_out
-
- def _get_metrics(self, inputs, outputs):
- metrics = {}
-
- fc_out = self._calc_logits(outputs["enc_out"], outputs["checkpoints"], inputs["tgt_pos"])
- tgt_lm_loss = layers.softmax_with_cross_entropy(logits=fc_out, label=inputs["tgt_label"])
- mean_tgt_lm_loss = layers.mean(tgt_lm_loss)
- loss = mean_tgt_lm_loss
- metrics["token_lm_loss"] = mean_tgt_lm_loss
-
- metrics["loss"] = loss
- return metrics
-
- def _get_statistics(self, inputs, outputs):
- statistics = {}
- if "tgt_label" in inputs:
- statistics["tokens_num"] = layers.reduce_sum(
- layers.fill_constant_batch_size_like(input=inputs["tgt_label"], value=1.0, shape=[-1], dtype="int64"))
- statistics["batch_size"] = layers.reduce_sum(
- layers.fill_constant_batch_size_like(input=inputs["token_ids"], value=1.0, shape=[-1], dtype="int64"))
- return statistics
-
- def get_metrics_and_statistics(self, inputs, outputs):
- """
- Get metrics and statistics.
- """
- metrics = self._get_metrics(inputs, outputs)
- statistics = self._get_statistics(inputs, outputs)
- return metrics, statistics
-
- def infer(self, inputs, outputs):
- """
- Run model inference.
- """
- if self.do_generation:
- return self.generator.inference(self, inputs, outputs)
- else:
- raise NotImplementedError
-
- def _run_generation(self, inputs):
- """
- Run generation.
- """
- batch_size = len(inputs["data_id"])
- inputs["parent_idx"] = np.array(range(batch_size), dtype="int64")
- outputs = self._execute(
- self.infer_program, self._get_feed(inputs, is_infer=True), self.infer_fetch_dict, return_numpy=False)
-
- predictions = []
- data_id_list = np.array(outputs["data_id"]).reshape(-1).tolist()
- token_ids_list = np.array(outputs["token_ids"]).squeeze(2).tolist()
- seq_ids = outputs["finished_ids"]
- seq_ids_np = np.array(outputs["finished_ids"])
- seq_scores_np = np.array(outputs["finished_scores"])
- for i, (data_id, token_ids) in enumerate(zip(data_id_list, token_ids_list)):
- start = seq_ids.lod()[0][i]
- end = seq_ids.lod()[0][i + 1]
- for j in range(start, end):
- sub_start = seq_ids.lod()[1][j]
- sub_end = seq_ids.lod()[1][j + 1]
- info = {}
- info["data_id"] = data_id
- info["decode_score"] = float(seq_scores_np[sub_end - 1])
- info["context_token_ids"] = token_ids
- info["response_token_ids"] = seq_ids_np[sub_start:sub_end].tolist()
- predictions.append(info)
- return predictions
-
- def infer_step(self, inputs):
- """
- Run one inference step.
- """
- if self.do_generation:
- if self.generator.num_samples:
- inputs = {
- name: repeat_array_or_tensor(array_or_tensor, self.place, self.generator.num_samples)
- for name, array_or_tensor in inputs.items()
- }
-
- if self.mem_efficient:
- predictions = []
- for idx in range(0, len(inputs["data_id"]), self.batch_size):
- part_inputs = {
- name: slice_array_or_tensor(array_or_tensor, self.place, idx, idx + self.batch_size)
- for name, array_or_tensor in inputs.items()
- }
- part_outputs = self._run_generation(part_inputs)
- predictions.extend(part_outputs)
- else:
- predictions = self._run_generation(inputs)
- return predictions
- else:
- return self._execute(self.infer_program, self._get_feed(inputs, is_infer=True), self.infer_fetch_dict)
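One detail of the deleted `_gen_input` worth spelling out: `layers.scale(x, scale=1e4, bias=-1.0, bias_after_scale=False)` computes `(x - 1.0) * 1e4`, turning a 0/1 visibility mask into an additive attention bias. A tiny numpy illustration with an invented mask:

```python
import numpy as np

# (mask - 1.0) * 1e4: visible positions become 0, masked positions become -1e4,
# which is the bias added to the attention logits before the softmax.
input_mask = np.array([[1.0, 1.0, 0.0],
                       [1.0, 0.0, 0.0]], dtype="float32")
attn_bias = (input_mask - 1.0) * 1e4
print(attn_bias)
# [[     0.      0. -10000.]
#  [     0. -10000. -10000.]]
```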
diff --git a/modules/text/text_generation/plato2_en_large/module.py b/modules/text/text_generation/plato2_en_large/module.py
index 61e684db..ffc3fa27 100644
--- a/modules/text/text_generation/plato2_en_large/module.py
+++ b/modules/text/text_generation/plato2_en_large/module.py
@@ -12,83 +12,102 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import ast
-import os
-import json
-import sys
import argparse
import contextlib
+import os
+import sys
from collections import namedtuple
-import paddle.fluid as fluid
+import paddle
+import paddle.nn as nn
+
import paddlehub as hub
+from .model import Plato2InferModel
+from .readers.nsp_reader import NSPReader
+from .readers.plato_reader import PlatoReader
+from .utils import gen_inputs
+from .utils.args import parse_args
+from .utils.args import str2bool
+from paddlehub.module.module import moduleinfo
from paddlehub.module.module import runnable
+from paddlehub.module.module import serving
from paddlehub.module.nlp_module import DataFormatError
-from paddlehub.common.logger import logger
-from paddlehub.module.module import moduleinfo, serving
-
-import plato2_en_large.models as plato_models
-from plato2_en_large.tasks.dialog_generation import DialogGeneration
-from plato2_en_large.utils import check_cuda, Timer
-from plato2_en_large.utils.args import parse_args
@moduleinfo(
name="plato2_en_large",
- version="1.0.0",
+ version="1.1.0",
summary=
"A novel pre-training model for dialogue generation, incorporated with latent discrete variables for one-to-many relationship modeling.",
author="baidu-nlp",
author_email="",
type="nlp/text_generation",
)
-class Plato(hub.NLPPredictionModule):
- def _initialize(self):
+class Plato2(nn.Layer, hub.NLPPredictionModule):
+
+ def __init__(self):
"""
initialize with the necessary elements
"""
- if "CUDA_VISIBLE_DEVICES" not in os.environ:
- raise RuntimeError("The module only support GPU. Please set the environment variable CUDA_VISIBLE_DEVICES.")
-
+ super(Plato2, self).__init__()
args = self.setup_args()
- self.task = DialogGeneration(args)
- self.model = plato_models.create_model(args, fluid.CUDAPlace(0))
+
+ if args.num_layers == 24:
+ n_head = 16
+ hidden_size = 1024
+ elif args.num_layers == 32:
+ n_head = 32
+ hidden_size = 2048
+ else:
+ raise ValueError('The pre-trained model only support 24 or 32 layers, '
+ 'but received num_layers=%d.' % args.num_layers)
+
+ self.plato_reader = PlatoReader(args)
+ nsp_reader = NSPReader(args)
+ self.model = Plato2InferModel(nsp_reader, args.num_layers, n_head, hidden_size)
+ state_dict = paddle.load(args.init_from_ckpt)
+ self.model.set_state_dict(state_dict)
+ self.model.eval()
self.Example = namedtuple("Example", ["src", "data_id"])
+ self.latent_type_size = args.latent_type_size
self._interactive_mode = False
def setup_args(self):
"""
Setup arguments.
"""
- assets_path = os.path.join(self.directory, "assets")
- vocab_path = os.path.join(assets_path, "vocab.txt")
- init_pretraining_params = os.path.join(assets_path, "32L", "Plato")
- spm_model_file = os.path.join(assets_path, "spm.model")
- nsp_inference_model_path = os.path.join(assets_path, "32L", "NSP")
- config_path = os.path.join(assets_path, "32L.json")
+ ckpt_path = os.path.join(self.directory, 'assets', '32L.pdparams')
+ vocab_path = os.path.join(self.directory, 'assets', 'vocab.txt')
+ spm_model_file = os.path.join(self.directory, 'assets', 'spm.model')
# ArgumentParser.parse_args uses argv[1:] and drops the first argument, so the first item in sys.argv is only a placeholder
sys.argv = [
- "", "--model", "Plato", "--vocab_path",
- "%s" % vocab_path, "--do_lower_case", "False", "--init_pretraining_params",
- "%s" % init_pretraining_params, "--spm_model_file",
- "%s" % spm_model_file, "--nsp_inference_model_path",
- "%s" % nsp_inference_model_path, "--ranking_score", "nsp_score", "--do_generation", "True", "--batch_size",
- "1", "--config_path",
- "%s" % config_path
+ "--empty",
+ "--spm_model_file",
+ "%s" % spm_model_file,
+ "--vocab_path",
+ "%s" % vocab_path,
]
parser = argparse.ArgumentParser()
- plato_models.add_cmdline_args(parser)
- DialogGeneration.add_cmdline_args(parser)
- args = parse_args(parser)
+ group = parser.add_argument_group("Model")
+ group.add_argument("--init_from_ckpt", type=str, default=ckpt_path)
+ group.add_argument("--vocab_size", type=int, default=8001)
+ group.add_argument("--latent_type_size", type=int, default=20)
+ group.add_argument("--num_layers", type=int, default=32)
- args.load(args.config_path, "Model")
- args.run_infer = True # only build infer program
+ group = parser.add_argument_group("Task")
+ group.add_argument("--is_cn", type=str2bool, default=False)
+
+ NSPReader.add_cmdline_args(parser)
+
+ args = parse_args(parser)
+ args.batch_size *= args.latent_type_size
return args
@serving
+ @paddle.no_grad()
def generate(self, texts):
"""
Get the robot responses of the input texts.
@@ -113,10 +132,12 @@ class Plato(hub.NLPPredictionModule):
bot_responses = []
for i, text in enumerate(texts):
- example = self.Example(src=text.replace("\t", " [SEP] "), data_id=i)
- record = self.task.reader._convert_example_to_record(example, is_infer=True)
- data = self.task.reader._pad_batch_records([record], is_infer=True)
- pred = self.task.infer_step(self.model, data)[0] # batch_size is 1
+ example = self.Example(src=text.replace("\t", " [SEP] "), data_id=0)
+ record = self.plato_reader._convert_example_to_record(example, is_infer=True)
+ data = self.plato_reader._pad_batch_records([record], is_infer=True)
+ inputs = gen_inputs(data, self.latent_type_size)
+ inputs['tgt_ids'] = inputs['tgt_ids'].astype('int64')
+ pred = self.model(inputs)[0] # batch_size is 1
bot_response = pred["response"] # ignore data_id and score
bot_responses.append(bot_response)
@@ -144,11 +165,10 @@ class Plato(hub.NLPPredictionModule):
"""
Run as a command
"""
- self.parser = argparse.ArgumentParser(
- description='Run the %s module.' % self.name,
- prog='hub run %s' % self.name,
- usage='%(prog)s',
- add_help=True)
+ self.parser = argparse.ArgumentParser(description='Run the %s module.' % self.name,
+ prog='hub run %s' % self.name,
+ usage='%(prog)s',
+ add_help=True)
self.arg_input_group = self.parser.add_argument_group(title="Input options", description="Input data. Required")
self.arg_config_group = self.parser.add_argument_group(
@@ -167,14 +187,3 @@ class Plato(hub.NLPPredictionModule):
results = self.generate(texts=input_data)
return results
-
-
-if __name__ == "__main__":
- module = Plato()
- for result in module.generate(["Hello", "Hello\thi, nice to meet you, my name is tom\tso your name is tom?"]):
- print(result)
- with module.interactive_mode(max_turn=3):
- while True:
- human_utterance = input()
- robot_utterance = module.generate(human_utterance)
- print("Robot: %s" % robot_utterance[0])
diff --git a/modules/text/text_generation/plato2_en_large/readers/dialog_reader.py b/modules/text/text_generation/plato2_en_large/readers/dialog_reader.py
index 73be362f..ed645e9d 100644
--- a/modules/text/text_generation/plato2_en_large/readers/dialog_reader.py
+++ b/modules/text/text_generation/plato2_en_large/readers/dialog_reader.py
@@ -17,16 +17,13 @@ import csv
from collections import namedtuple
from contextlib import contextmanager
import gzip
-import os
import numpy as np
-import paddle.fluid as fluid
-from paddle.fluid.incubate.fleet.collective import fleet
-from plato2_en_large.utils import pad_batch_data
-from plato2_en_large.utils.args import str2bool
-from plato2_en_large.utils.masking import mask
-import plato2_en_large.utils.tokenization as tokenization
+from ..utils import pad_batch_data
+from ..utils.args import str2bool
+from ..utils.masking import mask
+from ..utils import tokenization
class DialogReader(object):
@@ -38,9 +35,17 @@ class DialogReader(object):
group = parser.add_argument_group("Reader")
group.add_argument("--max_src_len", type=int, default=128)
group.add_argument("--max_tgt_len", type=int, default=128)
- group.add_argument("--truncate_first_turn", type=str2bool, default=False)
- group.add_argument("--file_format", type=str, default="file", choices=["file", "filelist"])
- group.add_argument("--data_format", type=str, default="raw", choices=["raw", "tokenized", "numerical"])
+ group.add_argument("--truncate_first_turn",
+ type=str2bool,
+ default=False)
+ group.add_argument("--file_format",
+ type=str,
+ default="file",
+ choices=["file", "filelist"])
+ group.add_argument("--data_format",
+ type=str,
+ default="raw",
+ choices=["raw", "tokenized", "numerical"])
group.add_argument("--in_tokens", type=str2bool, default=False)
group.add_argument("--batch_size", type=int, default=16)
group.add_argument("--continuous_position", type=str2bool, default=True)
@@ -48,7 +53,9 @@ class DialogReader(object):
group.add_argument("--sort_pool_size", type=int, default=2**16)
group = parser.add_argument_group("Tokenizer")
- group.add_argument("--tokenizer", type=str, default="SentencePieceTokenizer")
+ group.add_argument("--tokenizer",
+ type=str,
+ default="SentencePieceTokenizer")
args, _ = parser.parse_known_args()
tokenizer_cls = getattr(tokenization, args.tokenizer)
tokenizer_cls.add_cmdline_args(parser)
@@ -89,7 +96,9 @@ class DialogReader(object):
self.fields += ["tgt_start_idx", "data_id"]
self.sort_key = lambda record: [len(record.token_ids)]
- self.Record = namedtuple("Record", self.fields, defaults=(None, ) * len(self.fields))
+ self.Record = namedtuple("Record",
+ self.fields,
+ defaults=(None, ) * len(self.fields))
self.features = {}
return
@@ -113,7 +122,9 @@ class DialogReader(object):
else:
s_tokens = self.tokenizer.tokenize(s)
- s_token_ids = self.tokenizer.convert_tokens_to_ids(s_tokens) + [self.eos_id]
+ s_token_ids = self.tokenizer.convert_tokens_to_ids(s_tokens) + [
+ self.eos_id
+ ]
s_token_ids_list.append(s_token_ids)
# trim src
@@ -123,9 +134,12 @@ class DialogReader(object):
total_token_num += len(s_token_ids_list[idx])
if total_token_num > self.max_src_len:
if self.truncate_first_turn and idx == 0:
- truncated_ids = s_token_ids_list[idx][:self.max_src_len - total_token_num]
+ truncated_ids = s_token_ids_list[idx][:self.max_src_len -
+ total_token_num]
if len(truncated_ids) > 1:
- s_token_ids_list[idx] = truncated_ids[:-1] + [self.eos_id]
+ s_token_ids_list[idx] = truncated_ids[:-1] + [
+ self.eos_id
+ ]
idx -= 1
break
idx -= 1
@@ -180,7 +194,11 @@ class DialogReader(object):
tgt_pos_ids = list(range(len(tgt_token_ids)))
pos_ids = list(range(len(token_ids)))
- field_values = {"token_ids": src_token_ids, "type_ids": src_type_ids, "pos_ids": src_pos_ids}
+ field_values = {
+ "token_ids": src_token_ids,
+ "type_ids": src_type_ids,
+ "pos_ids": src_pos_ids
+ }
field_values["tgt_start_idx"] = tgt_start_idx
field_values["data_id"] = example.data_id
@@ -204,7 +222,8 @@ class DialogReader(object):
def _read_numerical_file(self, fp, delimiter=";"):
for i, line in enumerate(fp):
- cols = tokenization.convert_to_unicode(line).strip().split(delimiter)
+ cols = tokenization.convert_to_unicode(line).strip().split(
+ delimiter)
cols = list(map(lambda x: list(map(int, x.split(" "))), cols))
if len(cols) > self.num_numerical_fields:
cols = cols[:self.num_numerical_fields]
@@ -213,6 +232,7 @@ class DialogReader(object):
yield record
def _read_file(self, input_file, phase, is_infer):
+
def __wrapper__():
with open_file(input_file) as fp:
if self.data_format == "numerical":
@@ -237,13 +257,18 @@ class DialogReader(object):
if phase == "train":
self.current_file_index = file_index
self.current_file = input_file
- file_reader = self._read_file(input_file.strip(), phase, is_infer)
+ file_reader = self._read_file(input_file.strip(), phase,
+ is_infer)
for record in file_reader():
yield record
return __wrapper__
- def _batch_reader(self, reader, phase=None, is_infer=False, sort_pool_size=2**16):
+ def _batch_reader(self,
+ reader,
+ phase=None,
+ is_infer=False,
+ sort_pool_size=2**16):
"""Construct a batch reader."""
def update_max_lens(max_lens, record):
@@ -251,7 +276,10 @@ class DialogReader(object):
if max_lens is None:
return self.sort_key(record)
else:
- return [max(max_len, l) for max_len, l in zip(max_lens, self.sort_key(record))]
+ return [
+ max(max_len, l)
+ for max_len, l in zip(max_lens, self.sort_key(record))
+ ]
def get_batch(reader):
"""Generate batches from reader."""
@@ -265,7 +293,8 @@ class DialogReader(object):
self.current_example += 1
max_lens = update_max_lens(max_lens, record)
if self.in_tokens:
- to_append = (len(batch) + 1) * sum(max_lens) <= self.batch_size
+ to_append = (len(batch) +
+ 1) * sum(max_lens) <= self.batch_size
else:
to_append = len(batch) < self.batch_size
if to_append:
@@ -286,7 +315,8 @@ class DialogReader(object):
self.current_example += 1
max_lens = update_max_lens(max_lens, record)
if self.in_tokens:
- to_append = (len(batch) + 1) * sum(max_lens) <= self.batch_size
+ to_append = (len(batch) +
+ 1) * sum(max_lens) <= self.batch_size
else:
to_append = len(batch) < self.batch_size
if to_append:
@@ -320,7 +350,12 @@ class DialogReader(object):
return __wrapper__
- def _distributed_batch_reader(self, batch_reader, num_part, part_id, is_test=False):
+ def _distributed_batch_reader(self,
+ batch_reader,
+ num_part,
+ part_id,
+ is_test=False):
+
def __wrapper__():
batches = []
for batch in batch_reader():
@@ -351,7 +386,8 @@ class DialogReader(object):
nonlocal reader
if reader is None:
if self.file_format == "filelist":
- reader = self._read_files(input_file, phase, is_infer, not phase.endswith("test"))
+ reader = self._read_files(input_file, phase, is_infer,
+ not phase.endswith("test"))
else:
if phase == "train":
self.total_file = 1
@@ -360,11 +396,18 @@ class DialogReader(object):
reader = self._read_file(input_file, phase, is_infer)
batch_reader = self._batch_reader(
- reader, phase, is_infer, sort_pool_size=self.sort_pool_size if not is_infer else 0)
+ reader,
+ phase,
+ is_infer,
+ sort_pool_size=self.sort_pool_size if not is_infer else 0)
if phase == "train":
- batch_reader = self._distributed_batch_reader(batch_reader, num_part, part_id)
+ batch_reader = self._distributed_batch_reader(
+ batch_reader, num_part, part_id)
elif phase.startswith("distributed"):
- batch_reader = self._distributed_batch_reader(batch_reader, num_part, part_id, is_test=True)
+ batch_reader = self._distributed_batch_reader(batch_reader,
+ num_part,
+ part_id,
+ is_test=True)
for epoch_index in range(num_epochs):
if phase == "train":
@@ -375,20 +418,28 @@ class DialogReader(object):
return __wrapper__
- def _gen_self_attn_mask(self, batch_token_ids, batch_tgt_start_idx=None, is_unidirectional=True, shift_len=0):
+ def _gen_self_attn_mask(self,
+ batch_token_ids,
+ batch_tgt_start_idx=None,
+ is_unidirectional=True,
+ shift_len=0):
max_len = max(map(len, batch_token_ids))
- input_mask_data = np.zeros((len(batch_token_ids), max_len + shift_len, max_len + shift_len))
+ input_mask_data = np.zeros(
+ (len(batch_token_ids), max_len + shift_len, max_len + shift_len))
if is_unidirectional:
for index, mask_data in enumerate(input_mask_data):
- start = 0 if batch_tgt_start_idx is None else batch_tgt_start_idx[index]
+ start = 0 if batch_tgt_start_idx is None else batch_tgt_start_idx[
+ index]
end = len(batch_token_ids[index])
mask_data[:end + shift_len, :start + shift_len] = 1.0
# Generate the lower triangular matrix using the slice of matrix
b = np.tril(np.ones([end - start, end - start]), 0)
- mask_data[start + shift_len:end + shift_len, start + shift_len:end + shift_len] = b
+ mask_data[start + shift_len:end + shift_len,
+ start + shift_len:end + shift_len] = b
else:
for index, token_ids in enumerate(batch_token_ids):
- input_mask_data[index, :len(token_ids) + shift_len, :len(token_ids) + shift_len] = 1.0
+ input_mask_data[index, :len(token_ids) +
+ shift_len, :len(token_ids) + shift_len] = 1.0
return input_mask_data.astype("float32")
def _pad_batch_records(self, batch_records, is_infer):
@@ -405,20 +456,24 @@ class DialogReader(object):
batch["pos_ids"] = pad_batch_data(batch_pos_ids, pad_id=self.pad_id)
batch_tgt_start_idx = [record.tgt_start_idx for record in batch_records]
- batch["generation_mask"] = self._gen_self_attn_mask(batch_token_ids, batch_tgt_start_idx=batch_tgt_start_idx)
+ batch["generation_mask"] = self._gen_self_attn_mask(
+ batch_token_ids, batch_tgt_start_idx=batch_tgt_start_idx)
if is_infer:
- tgt_ids = np.array([[[self.bos_id]]] * len(batch_token_ids), dtype="int64")
+ tgt_ids = np.array([[[self.bos_id]]] * len(batch_token_ids),
+ dtype="int64")
if self.continuous_position:
tgt_pos = np.array(batch_tgt_start_idx, dtype="int64")
else:
tgt_pos = np.zeros_like(batch_tgt_start_idx, dtype="int64")
tgt_pos = tgt_pos.reshape(-1, 1, 1)
- batch["init_score"] = np.zeros_like(tgt_ids, dtype="float32").reshape(-1, 1).tolist()
+ batch["init_score"] = np.zeros_like(
+ tgt_ids, dtype="float32").reshape(-1, 1).tolist()
batch["tgt_ids"] = tgt_ids.tolist()
batch["tgt_pos"] = tgt_pos.tolist()
- batch["tgt_generation_mask"] = batch["generation_mask"][:, 0:1, :].astype("float32")
+ batch["tgt_generation_mask"] = batch[
+ "generation_mask"][:, 0:1, :].astype("float32")
else:
batch["tgt_label"], batch["tgt_pos"] = mask(
batch_tokens=batch_token_ids,
@@ -427,7 +482,8 @@ class DialogReader(object):
is_unidirectional=True)
batch_data_id = [record.data_id for record in batch_records]
- batch["data_id"] = np.array(batch_data_id).astype("int64").reshape([-1, 1])
+ batch["data_id"] = np.array(batch_data_id).astype("int64").reshape(
+ [-1, 1])
return batch
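To make the mask construction in `_gen_self_attn_mask` easier to follow, here is a numpy sketch for a single record with `shift_len=0`: the context is visible to every position, while the target part is lower-triangular. `seq_len=5` and `tgt_start=3` are made-up values.

```python
import numpy as np

seq_len, tgt_start = 5, 3
mask = np.zeros((seq_len, seq_len), dtype="float32")
mask[:, :tgt_start] = 1.0                                  # everyone sees the context
tgt_len = seq_len - tgt_start
mask[tgt_start:, tgt_start:] = np.tril(np.ones((tgt_len, tgt_len)))
print(mask)                                                # unidirectional target mask
```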
diff --git a/modules/text/text_generation/plato2_en_large/readers/nsp_reader.py b/modules/text/text_generation/plato2_en_large/readers/nsp_reader.py
index 8258ed30..b13ce465 100644
--- a/modules/text/text_generation/plato2_en_large/readers/nsp_reader.py
+++ b/modules/text/text_generation/plato2_en_large/readers/nsp_reader.py
@@ -17,10 +17,10 @@ from collections import namedtuple
import numpy as np
-from plato2_en_large.readers.dialog_reader import DialogReader
-from plato2_en_large.utils import pad_batch_data
-from plato2_en_large.utils.args import str2bool
-from plato2_en_large.utils.masking import mask
+from .dialog_reader import DialogReader
+from ..utils import pad_batch_data
+from ..utils.args import str2bool
+from ..utils.masking import mask
class NSPReader(DialogReader):
@@ -30,27 +30,35 @@ class NSPReader(DialogReader):
def add_cmdline_args(cls, parser):
"""Add cmdline argurments."""
group = DialogReader.add_cmdline_args(parser)
- group.add_argument(
- "--attention_style", type=str, default="bidirectional", choices=["bidirectional", "unidirectional"])
- group.add_argument("--mix_negative_sample", type=str2bool, default=False)
+ group.add_argument("--attention_style",
+ type=str,
+ default="bidirectional",
+ choices=["bidirectional", "unidirectional"])
+ group.add_argument("--mix_negative_sample",
+ type=str2bool,
+ default=False)
return group
def __init__(self, args):
super(NSPReader, self).__init__(args)
self.fields.append("label")
- self.Record = namedtuple("Record", self.fields, defaults=(None, ) * len(self.fields))
+ self.Record = namedtuple("Record",
+ self.fields,
+ defaults=(None, ) * len(self.fields))
self.attention_style = args.attention_style
self.mix_negative_sample = args.mix_negative_sample
return
def _convert_example_to_record(self, example, is_infer):
- record = super(NSPReader, self)._convert_example_to_record(example, False)
+ record = super(NSPReader,
+ self)._convert_example_to_record(example, False)
if "label" in example._fields:
record = record._replace(label=int(example.label))
return record
def _mix_negative_sample(self, reader, neg_pool_size=2**16):
+
def gen_from_pool(pool):
num_samples = len(pool)
if num_samples == 1:
@@ -64,10 +72,16 @@ class NSPReader(DialogReader):
idx_i = pool[i].tgt_start_idx
idx_j = pool[j].tgt_start_idx
field_values = {}
- field_values["token_ids"] = pool[i].token_ids[:idx_i] + pool[j].token_ids[idx_j:]
- field_values["type_ids"] = pool[i].type_ids[:idx_i] + pool[j].type_ids[idx_j:]
- field_values["pos_ids"] = list(range(len(field_values["token_ids"])))
- neg_record = self.Record(**field_values, tgt_start_idx=idx_i, data_id=-1, label=0)
+ field_values["token_ids"] = pool[i].token_ids[:idx_i] + pool[
+ j].token_ids[idx_j:]
+ field_values["type_ids"] = pool[i].type_ids[:idx_i] + pool[
+ j].type_ids[idx_j:]
+ field_values["pos_ids"] = list(
+ range(len(field_values["token_ids"])))
+ neg_record = self.Record(**field_values,
+ tgt_start_idx=idx_i,
+ data_id=-1,
+ label=0)
pool.append(neg_record)
assert len(neg_record.token_ids) <= self.max_seq_len
self.global_rng.shuffle(pool)
@@ -88,11 +102,18 @@ class NSPReader(DialogReader):
return __wrapper__
- def _batch_reader(self, reader, phase=None, is_infer=False, sort_pool_size=2**16):
+ def _batch_reader(self,
+ reader,
+ phase=None,
+ is_infer=False,
+ sort_pool_size=2**16):
if self.mix_negative_sample:
reader = self._mix_negative_sample(reader)
- return super(NSPReader, self)._batch_reader(
- reader, phase=phase, is_infer=is_infer, sort_pool_size=sort_pool_size)
+ return super(NSPReader,
+ self)._batch_reader(reader,
+ phase=phase,
+ is_infer=is_infer,
+ sort_pool_size=sort_pool_size)
def _pad_batch_records(self, batch_records, is_infer):
"""
@@ -106,8 +127,10 @@ class NSPReader(DialogReader):
batch_label = [record.label for record in batch_records]
if self.attention_style == "unidirectional":
- batch["token_ids"] = pad_batch_data(batch_token_ids, pad_id=self.pad_id)
- batch["type_ids"] = pad_batch_data(batch_type_ids, pad_id=self.pad_id)
+ batch["token_ids"] = pad_batch_data(batch_token_ids,
+ pad_id=self.pad_id)
+ batch["type_ids"] = pad_batch_data(batch_type_ids,
+ pad_id=self.pad_id)
batch["pos_ids"] = pad_batch_data(batch_pos_ids, pad_id=self.pad_id)
tgt_label, tgt_pos, label_pos = mask(
batch_tokens=batch_token_ids,
@@ -116,7 +139,8 @@ class NSPReader(DialogReader):
sent_b_starts=batch_tgt_start_idx,
labels=batch_label,
is_unidirectional=True)
- attention_mask = self._gen_self_attn_mask(batch_token_ids, batch_tgt_start_idx)
+ attention_mask = self._gen_self_attn_mask(batch_token_ids,
+ batch_tgt_start_idx)
else:
batch_mask_token_ids, tgt_label, tgt_pos, label_pos = mask(
batch_tokens=batch_token_ids,
@@ -129,10 +153,13 @@ class NSPReader(DialogReader):
is_unidirectional=False)
if not is_infer:
batch_token_ids = batch_mask_token_ids
- batch["token_ids"] = pad_batch_data(batch_token_ids, pad_id=self.pad_id)
- batch["type_ids"] = pad_batch_data(batch_type_ids, pad_id=self.pad_id)
+ batch["token_ids"] = pad_batch_data(batch_token_ids,
+ pad_id=self.pad_id)
+ batch["type_ids"] = pad_batch_data(batch_type_ids,
+ pad_id=self.pad_id)
batch["pos_ids"] = pad_batch_data(batch_pos_ids, pad_id=self.pad_id)
- attention_mask = self._gen_self_attn_mask(batch_token_ids, is_unidirectional=False)
+ attention_mask = self._gen_self_attn_mask(batch_token_ids,
+ is_unidirectional=False)
batch["attention_mask"] = attention_mask
batch["label_pos"] = label_pos
@@ -144,5 +171,6 @@ class NSPReader(DialogReader):
batch["tgt_pos"] = tgt_pos
batch_data_id = [record.data_id for record in batch_records]
- batch["data_id"] = np.array(batch_data_id).astype("int64").reshape([-1, 1])
+ batch["data_id"] = np.array(batch_data_id).astype("int64").reshape(
+ [-1, 1])
return batch
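The negative sampling in `_mix_negative_sample` above splices the context of one record onto the response of another and labels the pair 0. A tiny illustration with invented token ids:

```python
# Record i keeps its context (tokens before its tgt_start_idx); record j donates
# its response (tokens from its own tgt_start_idx onwards).
rec_i_tokens, idx_i = [1, 5, 6, 2, 9, 9, 2], 4
rec_j_tokens, idx_j = [1, 7, 2, 8, 8, 8, 2], 3

neg_token_ids = rec_i_tokens[:idx_i] + rec_j_tokens[idx_j:]
print(neg_token_ids)        # mismatched context/response pair, label = 0
```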
diff --git a/modules/text/text_generation/plato2_en_large/readers/plato_reader.py b/modules/text/text_generation/plato2_en_large/readers/plato_reader.py
index 8502f817..a72b1c89 100644
--- a/modules/text/text_generation/plato2_en_large/readers/plato_reader.py
+++ b/modules/text/text_generation/plato2_en_large/readers/plato_reader.py
@@ -15,9 +15,9 @@
import numpy as np
-from plato2_en_large.readers.dialog_reader import DialogReader
-from plato2_en_large.utils import pad_batch_data
-from plato2_en_large.utils.masking import mask
+from .dialog_reader import DialogReader
+from ..utils import pad_batch_data
+from ..utils.masking import mask
class PlatoReader(DialogReader):
@@ -47,9 +47,13 @@ class PlatoReader(DialogReader):
batch["pos_ids"] = pad_batch_data(batch_pos_ids, pad_id=self.pad_id)
batch["generation_mask"] = self._gen_self_attn_mask(
- batch_token_ids, batch_tgt_start_idx=batch_tgt_start_idx, is_unidirectional=True, shift_len=1)
+ batch_token_ids,
+ batch_tgt_start_idx=batch_tgt_start_idx,
+ is_unidirectional=True,
+ shift_len=1)
if not is_infer:
- batch["recognition_mask"] = self._gen_self_attn_mask(batch_token_ids, is_unidirectional=False, shift_len=1)
+ batch["recognition_mask"] = self._gen_self_attn_mask(
+ batch_token_ids, is_unidirectional=False, shift_len=1)
if is_infer:
tgt_ids = np.array([[[self.bos_id]]] * batch_size, dtype="int64")
@@ -58,20 +62,21 @@ class PlatoReader(DialogReader):
else:
tgt_pos = np.zeros_like(batch_tgt_start_idx, dtype="int64")
tgt_pos = tgt_pos.reshape(-1, 1, 1)
- batch["init_score"] = np.zeros_like(tgt_ids, dtype="float32").reshape(-1, 1).tolist()
+ batch["init_score"] = np.zeros_like(
+ tgt_ids, dtype="float32").reshape(-1, 1).tolist()
batch["tgt_ids"] = tgt_ids.tolist()
batch["tgt_pos"] = tgt_pos.tolist()
batch["parent_idx"] = np.array(range(batch_size), dtype="int32")
- batch["tgt_generation_mask"] = batch["generation_mask"][:, 0:1, :].astype("float32")
+ batch["tgt_generation_mask"] = batch[
+ "generation_mask"][:, 0:1, :].astype("float32")
else:
- mask_return_list = mask(
- batch_tokens=batch_token_ids,
- vocab_size=self.vocab_size,
- sent_b_starts=batch_tgt_start_idx,
- is_unidirectional=True,
- use_latent=True,
- use_bow=self.use_bow)
+ mask_return_list = mask(batch_tokens=batch_token_ids,
+ vocab_size=self.vocab_size,
+ sent_b_starts=batch_tgt_start_idx,
+ is_unidirectional=True,
+ use_latent=True,
+ use_bow=self.use_bow)
batch["tgt_label"] = mask_return_list[0]
batch["tgt_pos"] = mask_return_list[1]
if self.use_bow:
@@ -79,5 +84,6 @@ class PlatoReader(DialogReader):
batch["bow_pos"] = mask_return_list[3]
batch_data_id = [record.data_id for record in batch_records]
- batch["data_id"] = np.array(batch_data_id).astype("int64").reshape([-1, 1])
+ batch["data_id"] = np.array(batch_data_id).astype("int64").reshape(
+ [-1, 1])
return batch
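`PlatoReader` passes `shift_len=1` to both masks because the model prepends a single latent (or mask) embedding to the sequence, so every mask gains one extra leading row and column that all positions may attend to. A numpy sketch with invented sizes:

```python
import numpy as np

max_len, tgt_start, shift = 4, 2, 1
mask = np.zeros((max_len + shift, max_len + shift), dtype="float32")
mask[:max_len + shift, :tgt_start + shift] = 1.0    # latent slot + context visible
tgt_len = max_len - tgt_start
mask[tgt_start + shift:, tgt_start + shift:] = np.tril(np.ones((tgt_len, tgt_len)))
print(mask.shape)                                    # (5, 5): +1 for the latent slot
```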
diff --git a/modules/text/text_generation/plato2_en_large/tasks/__init__.py b/modules/text/text_generation/plato2_en_large/tasks/__init__.py
deleted file mode 100644
index f81c1d18..00000000
--- a/modules/text/text_generation/plato2_en_large/tasks/__init__.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Define task."""
-
-from .task_base import Task
-
-TASK_REGISTRY = {}
-
-__all__ = ["TASK_REGISTRY", "register_task", "create_task", "add_cmdline_args"]
-
-
-def register_task(name):
- """
- Register a new task class.
- """
-
- def __wrapped__(cls):
- if name in TASK_REGISTRY:
- raise ValueError(f"Cannot register duplicate task ({name})")
- if not issubclass(cls, Task):
- raise ValueError(f"Task ({name}: {cls.__name__}) must extend Task")
- TASK_REGISTRY[name] = cls
- return cls
-
- return __wrapped__
-
-
-def create_task(args) -> Task:
- """
- Create a task.
- """
- return TASK_REGISTRY[args.task](args)
-
-
-def add_cmdline_args(parser):
- """
- Add cmdline argument of Task.
- """
- group = parser.add_argument_group("Task")
- group.add_argument("--task", type=str, required=True)
-
- args, _ = parser.parse_known_args()
- if args.task not in TASK_REGISTRY:
- raise ValueError(f"Unknown task type: {args.task}")
- TASK_REGISTRY[args.task].add_cmdline_args(parser)
- return group
-
-
-import plato2_en_large.tasks.dialog_generation
-import plato2_en_large.tasks.next_sentence_prediction
diff --git a/modules/text/text_generation/plato2_en_large/tasks/dialog_generation.py b/modules/text/text_generation/plato2_en_large/tasks/dialog_generation.py
deleted file mode 100644
index 5f425846..00000000
--- a/modules/text/text_generation/plato2_en_large/tasks/dialog_generation.py
+++ /dev/null
@@ -1,292 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Dialogue generation task."""
-
-from collections import defaultdict
-import math
-
-from plato2_en_large.readers.dialog_reader import DialogReader
-from plato2_en_large.readers.plato_reader import PlatoReader
-from plato2_en_large.tasks import register_task
-from plato2_en_large.tasks.task_base import Task
-from plato2_en_large.utils.args import str2bool
-from plato2_en_large.utils.inference import create_predictor
-
-
-def post_process_context(token_ids, reader, merge=True):
- """Post-process the context sequence."""
- context = []
- utt = []
- for tok_id in token_ids[1:]:
- if tok_id == reader.eos_id:
- utt = reader.tokenizer.convert_ids_to_tokens(utt)
- if merge:
- utt = reader.tokenizer.merge_subword(utt)
- context.append(utt)
- utt = []
- else:
- utt.append(tok_id)
- return context
-
-
-def post_process_response(token_ids, reader, merge=True):
- """
-    Post-process the decoded sequence. Truncate from the first <eos>
-    and remove the <bos> and <eos> tokens currently.
- """
- eos_pos = len(token_ids)
- for i, tok_id in enumerate(token_ids):
- if tok_id == reader.eos_id:
- eos_pos = i
- break
- token_ids = token_ids[1:eos_pos]
- response = reader.tokenizer.convert_ids_to_tokens(token_ids)
- if merge:
- response = reader.tokenizer.merge_subword(response)
- return token_ids, response
-
-
-def get_cross_turn_repetition(context, pred_tokens, eos_idx, is_cn=False):
- """Get cross-turn repetition."""
- if len(pred_tokens) == 0:
- return 1.0
- if is_cn:
- context = ["".join(utt) for utt in context]
- pred_tokens = "".join(pred_tokens)
-
- pred_tri_grams = set()
- for i in range(len(pred_tokens) - 2):
- tri_gram = tuple(pred_tokens[i:i + 3])
- pred_tri_grams.add(tri_gram)
- for utt in context:
- for i in range(len(utt) - 2):
- tri_gram = tuple(utt[i:i + 3])
- if tri_gram in pred_tri_grams:
- return 1.0
- return 0.0
-
-
-def get_in_turn_repetition(pred, is_cn=False):
- """Get in-turn repetition."""
- if len(pred) == 0:
- return 1.0
- if isinstance(pred[0], str):
- pred = [tok.lower() for tok in pred]
- if is_cn:
- pred = "".join(pred)
- tri_grams = set()
- for i in range(len(pred) - 2):
- tri_gram = tuple(pred[i:i + 3])
- if tri_gram in tri_grams:
- return 1.0
- tri_grams.add(tri_gram)
- return 0.0
-
-
-def get_nsp_score_batch(nsp_predictor, predictions):
- """
- Get NSP scores of a batch.
- """
- import argparse
- from collections import namedtuple
-
- from plato2_en_large.readers.nsp_reader import NSPReader
- from plato2_en_large.utils.args import parse_args
- from plato2_en_large.tasks.next_sentence_prediction import NextSentencePrediction
-
- parser = argparse.ArgumentParser()
- NextSentencePrediction.add_cmdline_args(parser)
- parser.add_argument("--num_samples", type=int, default=None)
- parser.add_argument("--config_path", type=str, required=True)
- parser.add_argument("--mem_efficient", type=str2bool, default=False)
-
- args = parse_args(parser, allow_unknown=True)
- args.load(args.config_path)
- if not args.mem_efficient:
- if args.num_samples:
- args.batch_size *= args.num_samples
- if args.latent_type_size:
- args.batch_size *= args.latent_type_size
- args.tokenized_input = True
- reader = NSPReader(args)
-
- def __reader__():
- headers = ["src", "tgt", "data_id"]
-
- Example = namedtuple("Example", headers)
-
- for i, info in enumerate(predictions):
- context = post_process_context(info["context_token_ids"], reader, merge=False)
- context_tokenized_input = " [SEP] ".join(" ".join(utt) for utt in context)
- _, response = post_process_response(info["response_token_ids"], reader, merge=False)
- response_tokenized_input = " ".join(response)
- example = Example(src=context_tokenized_input, tgt=response_tokenized_input, data_id=i)
- record = reader._convert_example_to_record(example, is_infer=True)
- yield record
- return
-
- generator = reader.data_generator(
- reader=__reader__,
- is_infer=True,
- phase="test",
- )
-
- steps = 0
- for data in generator():
- outputs = nsp_predictor(data)
- for probs, data_id in zip(outputs[0], outputs[-1]):
- data_id = data_id[0]
- info = predictions[data_id]
- info["nsp_score"] = float(probs[1])
-
- return
-
-
-@register_task("DialogGeneration")
-class DialogGeneration(Task):
- """
- Define dialogue response generation.
- """
-
- @classmethod
- def add_cmdline_args(cls, parser):
- """Add cmdline argurments."""
- group = parser.add_argument_group("Task")
- group.add_argument("--do_generation", type=str2bool, default=False)
- group.add_argument("--is_cn", type=str2bool, default=False)
-
- group.add_argument("--nsp_inference_model_path", type=str, default=None)
- group.add_argument("--nsp_attention_style", type=str, default="bidirectional")
-
- group.add_argument("--ranking_score", type=str, default="decode_score")
-
- args, _ = parser.parse_known_args()
- if args.model == "Plato":
- PlatoReader.add_cmdline_args(parser)
- else:
- DialogReader.add_cmdline_args(parser)
- return group
-
- def __init__(self, args):
- super(DialogGeneration, self).__init__(args)
- self.do_generation = args.do_generation
- self.is_cn = args.is_cn
- if args.model == "Plato":
- self.reader = PlatoReader(args)
- else:
- self.reader = DialogReader(args)
-
- if args.nsp_inference_model_path:
- self.nsp_predictor = create_predictor(args.nsp_inference_model_path, args.is_distributed)
- self.nsp_attention_style = args.nsp_attention_style
- else:
- self.nsp_predictor = None
-
- self.ranking_score = args.ranking_score
- self.max_dec_len = args.max_dec_len
- return
-
- def _post_process_generation_output(self, predictions):
- """
- Post process generation output.
-
- Calculate repetion, reranking.
- """
- for info in predictions:
- tokens = post_process_context(info["context_token_ids"], self.reader)
- pred_token_ids, pred_tokens = post_process_response(info["response_token_ids"], self.reader)
- info["context"] = " [SEP] ".join(" ".join(u) for u in tokens)
- info["response"] = " ".join(pred_tokens)
- info["num_token"] = len(pred_token_ids)
- info["cross_turn_repetition"] = get_cross_turn_repetition(tokens, pred_tokens, self.reader.eos_id,
- self.is_cn)
- info["in_turn_repetition"] = max(
- get_in_turn_repetition(pred_tokens, self.is_cn), get_in_turn_repetition(pred_token_ids))
- if self.nsp_predictor is not None:
- get_nsp_score_batch(self.nsp_predictor, predictions)
-
- group = defaultdict(list)
- for info in predictions:
- group[info["data_id"]].append(info)
-
- predictions = []
- for data_id in group:
- infos = group[data_id]
- for info in infos:
- info["score"] = info[self.ranking_score]
- if self.max_dec_len is not None and info["num_token"] >= self.max_dec_len: # not ending
- info["score"] -= 1e3
- elif info["cross_turn_repetition"] > 0:
- info["score"] -= 1e3
- elif info["in_turn_repetition"] > 0:
- info["score"] -= 1e3
- infos = sorted(infos, key=lambda info: -info["score"])
- pred = infos[0]
- keep_attr = ["data_id", "score", "response"]
- pred = {k: pred[k] for k in keep_attr}
- predictions.append(pred)
- return predictions
-
- def _post_process_scoring_output(self, predictions):
- raise NotImplementedError
-
- def _post_process_infer_output(self, predictions):
- if self.do_generation:
- return self._post_process_generation_output(predictions)
- else:
- return self._post_process_scoring_output(predictions)
-
- def merge_mertrics_and_statistics(self, outputs, part_outputs):
- """
- Merge two evaulation output.
- """
- if outputs is None:
- return part_outputs
-
- if part_outputs is None:
- return outputs
-
- batch_size = outputs.pop("batch_size")
- tokens_num = outputs.pop("tokens_num")
- part_batch_size = part_outputs.pop("batch_size")
- part_tokens_num = part_outputs.pop("tokens_num")
-
- new_outputs = {"batch_size": batch_size + part_batch_size, "tokens_num": tokens_num + part_tokens_num}
- for k in outputs:
- if k.startswith("token_"):
- new_outputs[k] = (
- outputs[k] * tokens_num + part_outputs[k] * part_tokens_num) / new_outputs["tokens_num"]
- else:
- new_outputs[k] = (
- outputs[k] * batch_size + part_outputs[k] * part_batch_size) / new_outputs["batch_size"]
- return new_outputs
-
- def get_metrics(self, outputs):
- """
- Get metrics.
- """
- if outputs is None:
- raise ValueError("metrics is None")
- outputs = dict(outputs)
- outputs.pop("batch_size", None)
- outputs.pop("tokens_num", None)
- metrics = {}
- for k in outputs:
- if k.startswith("token_"):
- metrics[k[6:]] = outputs[k]
- else:
- metrics[k] = outputs[k]
- if k == "token_lm_loss":
- metrics["ppl"] = math.exp(outputs[k])
- return metrics
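
The deleted `DialogGeneration` task above reranked candidate responses by penalizing any candidate that repeats a tri-gram either across turns or within its own turn. A standalone toy check of that tri-gram test, with the logic adapted from `get_in_turn_repetition` above (returning a bool instead of 1.0/0.0):

```python
# Toy check of the tri-gram repetition test used by the removed reranker.
# Logic adapted from get_in_turn_repetition above; inputs are made-up tokens.
def has_in_turn_repetition(pred):
    if len(pred) == 0:
        return True
    if isinstance(pred[0], str):
        pred = [tok.lower() for tok in pred]
    tri_grams = set()
    for i in range(len(pred) - 2):
        tri_gram = tuple(pred[i:i + 3])
        if tri_gram in tri_grams:
            return True
        tri_grams.add(tri_gram)
    return False


print(has_in_turn_repetition("i am fine i am fine".split()))   # True
print(has_in_turn_repetition("i am fine thank you".split()))   # False
```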
diff --git a/modules/text/text_generation/plato2_en_large/tasks/next_sentence_prediction.py b/modules/text/text_generation/plato2_en_large/tasks/next_sentence_prediction.py
deleted file mode 100644
index 0f0d65d4..00000000
--- a/modules/text/text_generation/plato2_en_large/tasks/next_sentence_prediction.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Next sentence prediction task."""
-
-from plato2_en_large.readers.nsp_reader import NSPReader
-from plato2_en_large.tasks import register_task
-from plato2_en_large.tasks.task_base import Task
-from plato2_en_large.utils.args import str2bool
-
-
-@register_task("NextSentencePrediction")
-class NextSentencePrediction(Task):
- """
- Define dialogue response generation.
- """
-
- @classmethod
- def add_cmdline_args(cls, parser):
- """Add cmdline argurments."""
- group = NSPReader.add_cmdline_args(parser)
- return group
-
- def __init__(self, args):
- super(NextSentencePrediction, self).__init__(args)
- self.reader = NSPReader(args)
- return
-
- def _post_process_infer_output(self, predictions):
- predictions = [{
- "data_id": data_id.tolist()[0],
- "score": score.tolist()[1]
- } for data_id, score in zip(predictions["data_id"], predictions["scores"])]
- return predictions
diff --git a/modules/text/text_generation/plato2_en_large/tasks/task_base.py b/modules/text/text_generation/plato2_en_large/tasks/task_base.py
deleted file mode 100644
index dc4bbe44..00000000
--- a/modules/text/text_generation/plato2_en_large/tasks/task_base.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Task base."""
-
-from abc import abstractmethod, ABC
-
-from plato2_en_large.models.model_base import Model
-
-
-class Task(ABC):
- """
- Basic task.
- """
-
- def __init__(self, args):
- return
-
- def train_step(self, model: Model, inputs):
- """Run one training step."""
- outputs = model.train_step(inputs)
- outputs = {k: v.tolist()[0] for k, v in outputs.items()}
- return outputs
-
- def eval_step(self, model: Model, inputs):
- """Run one evaluation step"""
- outputs = model.eval_step(inputs)
- outputs = {k: v.tolist()[0] for k, v in outputs.items()}
- return outputs
-
- def infer_step(self, model: Model, inputs):
- """Run one inference step."""
- predictions = model.infer_step(inputs)
- outputs = self._post_process_infer_output(predictions)
- return outputs
-
- def _post_process_infer_output(self, predictions):
- """
- Post-process inference output.
- """
- return predictions
-
- def merge_mertrics_and_statistics(self, outputs, part_outputs):
- """
- Merge metrics and statistics.
- """
- if outputs is None:
- return part_outputs
-
- if part_outputs is None:
- return outputs
-
- batch_size = outputs.pop("batch_size")
- part_batch_size = part_outputs.pop("batch_size")
-
- new_outputs = {
- "batch_size": batch_size + part_batch_size,
- }
- for k in outputs:
- new_outputs[k] = (outputs[k] * batch_size + part_outputs[k] * part_batch_size) / new_outputs["batch_size"]
- return new_outputs
-
- def get_metrics(self, outputs):
- """
- Get metrics.
- """
- if outputs is None:
- raise ValueError("metrics is None")
- outputs = dict(outputs)
- # pop statistics
- outputs.pop("batch_size", None)
- return outputs
-
- def get_data_loader(self, model, *args, is_infer=False, **kwargs):
- generator = self.reader.data_generator(*args, is_infer=is_infer, **kwargs)
- return model.get_data_loader(generator, is_infer)
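
The base `merge_mertrics_and_statistics` above maintains running metrics as a batch-size weighted average of the partial outputs. A toy numeric check of that averaging (values made up), mirroring the loop above:

```python
# Toy check of the batch-size weighted averaging in the Task base class.
outputs = {"batch_size": 4, "loss": 2.0}
part_outputs = {"batch_size": 12, "loss": 1.0}

batch_size = outputs.pop("batch_size")
part_batch_size = part_outputs.pop("batch_size")
merged = {"batch_size": batch_size + part_batch_size}
for k in outputs:
    merged[k] = (outputs[k] * batch_size +
                 part_outputs[k] * part_batch_size) / merged["batch_size"]

print(merged)  # {'batch_size': 16, 'loss': 1.25}
```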
diff --git a/modules/text/text_generation/plato2_en_large/utils/__init__.py b/modules/text/text_generation/plato2_en_large/utils/__init__.py
index 4068cbd6..8865dcbb 100644
--- a/modules/text/text_generation/plato2_en_large/utils/__init__.py
+++ b/modules/text/text_generation/plato2_en_large/utils/__init__.py
@@ -14,160 +14,45 @@
"""Utils."""
from itertools import chain
-import os
-import time
-import sys
-
import numpy as np
-import paddle.fluid as fluid
+import paddle
-def to_lodtensor(data, place):
- """Convert data to LoDTensor."""
- if place is None:
- return data
- lengths = []
- while isinstance(data[0], list):
- lengths.append(list(map(len, data)))
- data = [x for xs in data for x in xs]
- if isinstance(data[0], float):
- data = np.array(data, dtype="float32")
+def repeat_array(array, times):
+ """Repeate numpy array."""
+ if isinstance(array, list):
+ return list(chain(*([array] * times)))
else:
- data = np.array(data, dtype="int64")
- data_tensor = fluid.LoDTensor()
- data_tensor.set(data, place)
- data_tensor.set_recursive_sequence_lengths(lengths)
- return data_tensor
+ return np.concatenate([array] * times, axis=0)
+
+
+def gen_inputs(inputs, latent_type_size):
+ batch_size = len(inputs["data_id"])
+ new_bsz = batch_size * latent_type_size
+ inputs = {
+ name: repeat_array(array, latent_type_size)
+ for name, array in inputs.items()
+ }
+ # Add latent_id
+ inputs["latent_id"] = np.array(
+ [i for i in range(latent_type_size) for _ in range(batch_size)],
+ dtype="int64").reshape([-1, 1])
+
+ #print('\nplato_inputs:')
+ for key in inputs:
+ inputs[key] = paddle.to_tensor(inputs[key])
+ if key in [
+ 'token_ids', 'type_ids', 'pos_ids', 'tgt_ids', 'tgt_pos',
+ 'data_id'
+ ]:
+ inputs[key] = paddle.squeeze(inputs[key], axis=-1)
+ #print(key, inputs[key].shape, inputs[key].dtype)
+ return inputs
def pad_batch_data(insts, pad_id=0):
"""Pad the instances to the max sequence length in batch. """
max_len = max(map(len, insts))
- inst_data = np.array([list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts])
+ inst_data = np.array(
+ [list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts])
return inst_data.astype("int64").reshape([-1, max_len, 1])
-
-
-def convert_lodtensor_to_list(tensor):
- data = np.array(tensor)
- recursive_sequence_lengths = tensor.recursive_sequence_lengths()
- recursive_sequence_lengths.reverse()
- for i, lengths in enumerate(recursive_sequence_lengths):
- shift = 0
- new_data = []
- for j, l in enumerate(lengths):
- new_data.append(data[shift:shift + l])
- shift += l
- data = new_data
- return data
-
-
-def concatenate_lodtensors(tensors, place):
- """Concatenate LoD tensors."""
- data = []
- recursive_sequence_lengths = []
- for tensor in tensors:
- data.append(np.array(tensor))
- recursive_sequence_lengths.append(tensor.recursive_sequence_lengths())
- data = np.concatenate(data, axis=0)
- recursive_sequence_lengths = [sum(lens, []) for lens in zip(*recursive_sequence_lengths)]
- data_tensor = fluid.LoDTensor()
- data_tensor.set(data, place)
- data_tensor.set_recursive_sequence_lengths(recursive_sequence_lengths)
- assert data_tensor.has_valid_recursive_sequence_lengths()
- return data_tensor
-
-
-def repeat_array_or_tensor(array_or_tensor, place, times):
- """Repeate numpy array or LoD tensor."""
- if isinstance(array_or_tensor, fluid.LoDTensor):
- data = [np.array(array_or_tensor)] * times
- recursive_sequence_lengths = [array_or_tensor.recursive_sequence_lengths()] * times
- data = np.concatenate(data, axis=0)
- recursive_sequence_lengths = [sum(lens, []) for lens in zip(*recursive_sequence_lengths)]
- data_tensor = fluid.LoDTensor()
- data_tensor.set(data, place)
- data_tensor.set_recursive_sequence_lengths(recursive_sequence_lengths)
- assert data_tensor.has_valid_recursive_sequence_lengths()
- return data_tensor
- elif isinstance(array_or_tensor, list):
- return list(chain(*([array_or_tensor] * times)))
- else:
- return np.concatenate([array_or_tensor] * times, axis=0)
-
-
-def slice_array_or_tensor(array_or_tensor, place, begin, end):
- """Repeate numpy array or LoD tensor."""
- if isinstance(array_or_tensor, fluid.LoDTensor):
- data = convert_lodtensor_to_list(array_or_tensor)
- data = data[begin:end]
- return to_lodtensor(data, place)
- else:
- return array_or_tensor[begin:end]
-
-
-def init_checkpoint(exe, init_checkpoint_path, main_program):
- """Initialize from checkpoint."""
- assert os.path.exists(init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path
-
- def existed_persitables(var):
- """Whether var is a persistables."""
- if not fluid.io.is_persistable(var):
- return False
- return os.path.exists(os.path.join(init_checkpoint_path, var.name))
-
- fluid.io.load_vars(exe, init_checkpoint_path, main_program=main_program, predicate=existed_persitables)
- print(f"Load model from {init_checkpoint_path}")
-
-
-def init_pretraining_params(exe, pretraining_params_path, main_program):
- """Only initialize parameters."""
- assert os.path.exists(pretraining_params_path), "[%s] cann't be found." % pretraining_params_path
-
- def existed_params(var):
- """Whether var is a parameter."""
- if not isinstance(var, fluid.framework.Parameter):
- return False
- return os.path.exists(os.path.join(pretraining_params_path, var.name))
-
- fluid.io.load_vars(exe, pretraining_params_path, main_program=main_program, predicate=existed_params)
- print(f"Load pretraining parameters from {pretraining_params_path}.")
-
- return
-
-
-class Timer(object):
- def __init__(self):
- self._pass_time = 0
- self._start_time = None
- return
-
- def start(self):
- self._start_time = time.time()
-
- def pause(self):
- self._pass_time += time.time() - self._start_time
- self._start_time = None
-
- def reset(self):
- self._pass_time = 0
-
- @property
- def pass_time(self):
- if self._start_time is None:
- return self._pass_time
- else:
- return self._pass_time + time.time() - self._start_time
-
-
-ERROR_MESSAGE = "\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \
- Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n"
-
-
-def check_cuda(use_cuda, err=ERROR_MESSAGE):
- """Check CUDA."""
- try:
- if use_cuda and not fluid.is_compiled_with_cuda():
- print(err)
- sys.exit(1)
- except Exception as e:
- pass
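
The new `gen_inputs` helper above repeats every field of a batch `latent_type_size` times, appends a `latent_id` column, and converts the fields to paddle tensors, so all latent types can be scored in a single forward pass. A minimal usage sketch; `pad_batch_data` is copied from the code above, the token ids are toy values, and paddlepaddle is assumed to be installed:

```python
# Usage sketch for the pad_batch_data / gen_inputs helpers added above.
import numpy as np
import paddle


def pad_batch_data(insts, pad_id=0):
    """Pad the instances to the max sequence length in batch."""
    max_len = max(map(len, insts))
    inst_data = np.array(
        [list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts])
    return inst_data.astype("int64").reshape([-1, max_len, 1])


token_ids = pad_batch_data([[1, 5, 6, 2], [1, 7, 2]])
print(token_ids.shape)  # (2, 4, 1): batch x max_len x 1

# gen_inputs(inputs, latent_type_size) repeats each field latent_type_size
# times along the batch axis, adds a [batch * latent_type_size, 1] latent_id
# field, converts everything with paddle.to_tensor and squeezes the trailing
# axis from the id fields, e.g. token_ids ends up with shape [batch * K, 4]:
print(paddle.squeeze(paddle.to_tensor(token_ids), axis=-1).shape)  # [2, 4]
```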
diff --git a/modules/text/text_generation/plato2_en_large/utils/args.py b/modules/text/text_generation/plato2_en_large/utils/args.py
index 900c6bf2..514151ec 100644
--- a/modules/text/text_generation/plato2_en_large/utils/args.py
+++ b/modules/text/text_generation/plato2_en_large/utils/args.py
@@ -15,9 +15,6 @@
import argparse
import json
-import sys
-
-import paddle.fluid as fluid
def str2bool(v):
diff --git a/modules/text/text_generation/plato2_en_large/utils/inference.py b/modules/text/text_generation/plato2_en_large/utils/inference.py
deleted file mode 100644
index f21b0162..00000000
--- a/modules/text/text_generation/plato2_en_large/utils/inference.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference utils."""
-
-import os
-
-import paddle.fluid as fluid
-
-
-def create_predictor(inference_model_path, is_distributed=False):
- """Create predictor."""
- if is_distributed:
- dev_count = fluid.core.get_cuda_device_count()
- gpu_id = int(os.getenv("FLAGS_selected_gpus"))
- else:
- dev_count = 1
- gpu_id = 0
-
- place = fluid.CUDAPlace(gpu_id)
- exe = fluid.Executor(place)
-
- scope = fluid.Scope()
- with fluid.scope_guard(scope):
- inference_prog, feed_target_names, fetch_targets = fluid.io.load_inference_model(inference_model_path, exe)
-
- def __predict__(inputs):
- with fluid.scope_guard(scope):
- outputs = exe.run(inference_prog, feed=inputs, fetch_list=fetch_targets, return_numpy=True)
- return outputs
-
- return __predict__
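
The deleted `create_predictor` above wrapped the legacy `fluid` static-graph executor and is dropped along with the rest of the `fluid` code in this refactor. For orientation only, a rough modern equivalent built on the `paddle.inference` API might look like the sketch below; this is not code from this patch, and the file paths and GPU memory setting are placeholders:

```python
# Hedged sketch of a paddle.inference based predictor, shown only as a rough
# equivalent of the removed fluid helper. Not part of this patch.
import numpy as np
from paddle.inference import Config, create_predictor


def make_predictor(model_file, params_file, use_gpu=True, gpu_id=0):
    config = Config(model_file, params_file)
    if use_gpu:
        config.enable_use_gpu(1000, gpu_id)  # initial GPU memory pool in MB
    predictor = create_predictor(config)

    def __predict__(inputs):
        # inputs: {input_name: numpy array}
        for name in predictor.get_input_names():
            handle = predictor.get_input_handle(name)
            handle.copy_from_cpu(np.ascontiguousarray(inputs[name]))
        predictor.run()
        return [
            predictor.get_output_handle(name).copy_to_cpu()
            for name in predictor.get_output_names()
        ]

    return __predict__
```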
diff --git a/modules/text/text_generation/plato2_en_large/utils/masking.py b/modules/text/text_generation/plato2_en_large/utils/masking.py
index c9c40bec..9f0eb2d8 100644
--- a/modules/text/text_generation/plato2_en_large/utils/masking.py
+++ b/modules/text/text_generation/plato2_en_large/utils/masking.py
@@ -15,8 +15,6 @@
import numpy as np
-import plato2_en_large.utils
-
def mask(batch_tokens,
vocab_size,
@@ -47,14 +45,20 @@ def mask(batch_tokens,
else:
shift_len = 0
for sent_index, sent in enumerate(batch_tokens):
- sent_b_index = sent_b_starts[sent_index] if sent_b_starts is not None else 0
+ sent_b_index = sent_b_starts[
+ sent_index] if sent_b_starts is not None else 0
need_cal = True
if labels is not None:
- label_pos.append(sent_index * max_len + len(sent) - 1 + shift_len)
+ label_pos.append(sent_index * max_len + len(sent) - 1 +
+ shift_len)
if labels[sent_index] == 0:
need_cal = False
mask_label.extend(sent[sent_b_index + 1:])
- mask_pos.extend([sent_index * max_len + i + shift_len for i in range(sent_b_index, len(sent) - 1)])
+ mask_pos.extend([
+ sent_index * max_len + i + shift_len
+ for i in range(sent_b_index,
+ len(sent) - 1)
+ ])
mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
return_list = [mask_label, mask_pos]
@@ -64,14 +68,21 @@ def mask(batch_tokens,
bow_label = []
bow_pos = []
for sent_index, sent in enumerate(batch_tokens):
- sent_b_index = sent_b_starts[sent_index] if sent_b_starts is not None else 0
+ sent_b_index = sent_b_starts[
+ sent_index] if sent_b_starts is not None else 0
def __filter__(tok_id):
# TODO: exclude [EOS] from bow loss
return True
- bow_pos.extend([sent_index for i in range(sent_b_index + 1, len(sent)) if __filter__(sent[i])])
- bow_label.extend([sent[i] for i in range(sent_b_index + 1, len(sent)) if __filter__(sent[i])])
+ bow_pos.extend([
+ sent_index for i in range(sent_b_index + 1, len(sent))
+ if __filter__(sent[i])
+ ])
+ bow_label.extend([
+ sent[i] for i in range(sent_b_index + 1, len(sent))
+ if __filter__(sent[i])
+ ])
bow_label = np.array(bow_label).astype("int64").reshape([-1, 1])
bow_pos = np.array(bow_pos).astype("int64").reshape([-1, 1])
return_list += [bow_label, bow_pos]
@@ -80,7 +91,9 @@ def mask(batch_tokens,
total_token_num = sum(map(len, batch_tokens))
prob_mask = np.random.rand(total_token_num)
# TODO: fix replace_ids, include [UNK]
- replace_ids = np.random.randint(3, high=vocab_size, size=total_token_num)
+ replace_ids = np.random.randint(3,
+ high=vocab_size,
+ size=total_token_num)
prob_index = 0
for sent_index, sent in enumerate(batch_tokens):
# add pair label position
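
Among the reformatted hunks above, the `use_bow` branch collects bag-of-words labels and positions from the target half of each sentence (the TODO notes that `__filter__` currently keeps every token, including `[EOS]`). A standalone toy illustration of that selection, with made-up token ids and omitting the rest of `mask()`:

```python
# Toy illustration of the bow_label / bow_pos selection performed above.
# Token ids are made up; the real mask() also builds tgt_label/tgt_pos and
# the random-replacement masking, which are omitted here.
import numpy as np

batch_tokens = [[1, 11, 12, 2, 21, 22, 2],  # context ... then target ...
                [1, 13, 2, 23, 2]]
sent_b_starts = [3, 2]  # index where the target sub-sequence starts

bow_label, bow_pos = [], []
for sent_index, sent in enumerate(batch_tokens):
    sent_b_index = sent_b_starts[sent_index]
    # every token after the split point contributes one (position, label) pair
    bow_pos.extend(sent_index for _ in range(sent_b_index + 1, len(sent)))
    bow_label.extend(sent[i] for i in range(sent_b_index + 1, len(sent)))

bow_label = np.array(bow_label).astype("int64").reshape([-1, 1])
bow_pos = np.array(bow_pos).astype("int64").reshape([-1, 1])
print(bow_pos.ravel().tolist())    # [0, 0, 0, 1, 1]
print(bow_label.ravel().tolist())  # [21, 22, 2, 23, 2]
```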
diff --git a/modules/text/text_generation/plato2_en_large/utils/tokenization.py b/modules/text/text_generation/plato2_en_large/utils/tokenization.py
index 216ef86b..d2ad33a0 100644
--- a/modules/text/text_generation/plato2_en_large/utils/tokenization.py
+++ b/modules/text/text_generation/plato2_en_large/utils/tokenization.py
@@ -14,14 +14,10 @@
"""Tokenization classes."""
import collections
-import json
import sentencepiece as spm
-import six
import unicodedata
-from plato2_en_large.utils.args import str2bool
-
-SPIECE_UNDERLINE = u"▁".encode("utf-8")
+from .args import str2bool
def clean_text(text):
@@ -79,15 +75,18 @@ def encode_ids(spm_model, text, sample=False):
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
- if isinstance(text, six.binary_type):
+ if isinstance(text, str):
+ return text
+ elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
- return text
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
- fin = open(vocab_file)
+ fin = open(vocab_file, 'r', encoding="UTF-8")
for num, line in enumerate(fin):
items = convert_to_unicode(line.rstrip()).split("\t")
if len(items) > 2:
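
For completeness, the rewritten `convert_to_unicode` in the hunk above now handles Python 3 types directly instead of going through `six`: a `str` passes through, `bytes` are decoded as UTF-8, and anything else raises. A quick self-contained check (the function body is copied from the hunk):

```python
# Quick check of the rewritten convert_to_unicode above (body copied as-is).
def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if isinstance(text, str):
        return text
    elif isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    else:
        raise ValueError("Unsupported string type: %s" % (type(text)))


assert convert_to_unicode("hello") == "hello"
assert convert_to_unicode("hello".encode("utf-8")) == "hello"
try:
    convert_to_unicode(42)
except ValueError as err:
    print(err)  # Unsupported string type: <class 'int'>
```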
--
GitLab