From 524a75122f9d2585e1a74d039cd9cc1ecb70a3e9 Mon Sep 17 00:00:00 2001 From: jm12138 <2286040843@qq.com> Date: Thu, 29 Dec 2022 10:08:23 +0800 Subject: [PATCH] update plato2_en_large (#2114) * update plato2_en_large * update README --- .../text_generation/plato2_en_large/README.md | 196 ++++---- .../plato2_en_large/__init__.py | 0 .../text_generation/plato2_en_large/model.py | 451 ++++++++++++++++++ .../plato2_en_large/models/__init__.py | 65 --- .../plato2_en_large/models/generator.py | 268 ----------- .../plato2_en_large/models/model_base.py | 288 ----------- .../plato2_en_large/models/nsp_model.py | 107 ----- .../plato2_en_large/models/optimizer.py | 37 -- .../plato2_en_large/models/plato.py | 241 ---------- .../models/transformer_block.py | 332 ------------- .../models/unified_transformer.py | 378 --------------- .../text_generation/plato2_en_large/module.py | 125 ++--- .../plato2_en_large/readers/dialog_reader.py | 130 +++-- .../plato2_en_large/readers/nsp_reader.py | 74 ++- .../plato2_en_large/readers/plato_reader.py | 36 +- .../plato2_en_large/tasks/__init__.py | 61 --- .../tasks/dialog_generation.py | 292 ------------ .../tasks/next_sentence_prediction.py | 44 -- .../plato2_en_large/tasks/task_base.py | 86 ---- .../plato2_en_large/utils/__init__.py | 179 ++----- .../plato2_en_large/utils/args.py | 3 - .../plato2_en_large/utils/inference.py | 42 -- .../plato2_en_large/utils/masking.py | 31 +- .../plato2_en_large/utils/tokenization.py | 15 +- 24 files changed, 826 insertions(+), 2655 deletions(-) delete mode 100644 modules/text/text_generation/plato2_en_large/__init__.py create mode 100644 modules/text/text_generation/plato2_en_large/model.py delete mode 100644 modules/text/text_generation/plato2_en_large/models/__init__.py delete mode 100644 modules/text/text_generation/plato2_en_large/models/generator.py delete mode 100644 modules/text/text_generation/plato2_en_large/models/model_base.py delete mode 100644 modules/text/text_generation/plato2_en_large/models/nsp_model.py delete mode 100644 modules/text/text_generation/plato2_en_large/models/optimizer.py delete mode 100644 modules/text/text_generation/plato2_en_large/models/plato.py delete mode 100644 modules/text/text_generation/plato2_en_large/models/transformer_block.py delete mode 100644 modules/text/text_generation/plato2_en_large/models/unified_transformer.py delete mode 100644 modules/text/text_generation/plato2_en_large/tasks/__init__.py delete mode 100644 modules/text/text_generation/plato2_en_large/tasks/dialog_generation.py delete mode 100644 modules/text/text_generation/plato2_en_large/tasks/next_sentence_prediction.py delete mode 100644 modules/text/text_generation/plato2_en_large/tasks/task_base.py delete mode 100644 modules/text/text_generation/plato2_en_large/utils/inference.py diff --git a/modules/text/text_generation/plato2_en_large/README.md b/modules/text/text_generation/plato2_en_large/README.md index 892212ba..529024cd 100644 --- a/modules/text/text_generation/plato2_en_large/README.md +++ b/modules/text/text_generation/plato2_en_large/README.md @@ -1,154 +1,122 @@ -## 概述 +# plato2_en_large -PLATO2是一个超大规模生成式对话系统模型。它承袭了PLATO隐变量进行回复多样化生成的特性,能够就开放域话题进行流畅深入的聊天。据公开数据,其效果超越了Google 于2020年2月份发布的 Meena和Facebook AI Research于2020年4月份发布的Blender的效果。plato2_en_large包含1.6B参数,可用于一键预测对话回复,该Module仅支持使用GPU预测,不支持CPU。 -

-
-

+| 模型名称 | plato2_en_large | +| :------------------ | :--------------------: | +| 类别 | 文本-文本生成 | +| 网络 | PLATO2 | +| 数据集 | 大规模开放域英文数据集 | +| 是否支持Fine-tuning | 否 | +| 模型大小 | 19.3 GB | +| 最新更新日期 | 2022-11-05 | +| 数据指标 | - | -更多详情参考论文[PLATO-2: Towards Building an Open-Domain Chatbot via Curriculum Learning](https://arxiv.org/abs/2006.16779) +## 一、模型基本信息 -**注:plato2\_en\_large 模型大小12GB,下载时间较长,请耐心等候。运行此模型要求显存至少16GB。** +- ### 模型介绍 + - PLATO2 是一个超大规模生成式对话系统模型。它承袭了 PLATO 隐变量进行回复多样化生成的特性,能够就开放域话题进行流畅深入的聊天。据公开数据,其效果超越了 Google 于 2020 年 2 月份发布的 Meena 和 Facebook AI Research 于2020 年 4 月份发布的 Blender 的效果。plato2_en_large 包含 1.6B 参数,可用于一键预测对话回复。由于该 Module 参数量较多,推荐使用GPU预测。 -## 命令行预测 -```shell -$ hub run plato2_en_large --input_text="Hello, how are you" -``` +## 二、安装 -## API +- ### 1、环境依赖 -```python -def generate(texts): -``` + - paddlepaddle >= 2.0.0 + - paddlehub >= 2.1.0 | [如何安装PaddleHub](../../../../docs/docs_ch/get_start/installation.rst) -预测API,输入对话上下文,输出机器回复。 +- ### 2、安装 -**参数** + - ```shell + $ hub install plato2_en_large + ``` + - 如您安装时遇到问题,可参考:[零基础windows安装](../../../../docs/docs_ch/get_start/windows_quickstart.md) + | [零基础Linux安装](../../../../docs/docs_ch/get_start/linux_quickstart.md) | [零基础MacOS安装](../../../../docs/docs_ch/get_start/mac_quickstart.md) -* texts (list\[str\] or str): 如果不在交互模式中,texts应为list,每个元素为一次对话的上下文,上下文应包含人类和机器人的对话内容,不同角色之间的聊天用分隔符"\t"进行分割;例如[["Hello\thi, nice to meet you\tnice to meet you"]]。这个输入中包含1次对话,机器人回复了"hi, nice to meet you"后人类回复“nice to meet you”,现在轮到机器人回复了。如果在交互模式中,texts应为str,模型将自动构建它的上下文。 +## 三、模型API预测 -**返回** +- ### 1、命令行预测 -* results (list\[str\]): 每个元素为相应对话中机器人的新回复。 + - ```bash + $ hub run plato2_en_large --input_text="Hello, how are you" + ``` -**代码示例** +- ### 2、预测代码示例 -```python -import paddlehub as hub + - ```python + import paddlehub as hub -module = hub.Module(name="plato2_en_large") + module = hub.Module(name="plato2_en_large") -test_texts = ["Hello","Hello\thi, nice to meet you\tnice to meet you"] -results = module.generate(texts=test_texts) -for result in results: - print(result) -``` + test_texts = ["Hello","Hello\thi, nice to meet you\tnice to meet you"] + results = module.generate(texts=test_texts) + for result in results: + print(result) + ``` -```python -def interactive_mode(max_turn =6): -``` +- ### 3、API -进入交互模式。交互模式中,generate接口的texts将支持字符串类型。 + - ```python + def generate(texts): + ``` -**参数** + - 预测API,输入对话上下文,输出机器回复。 + - **参数** + - texts (list\[str\] or str): 如果不在交互模式中,texts应为list,每个元素为一次对话的上下文,上下文应包含人类和机器人的对话内容,不同角色之间的聊天用分隔符"\t"进行分割;例如[["Hello\thi, nice to meet you\tnice to meet you"]]。这个输入中包含1次对话,机器人回复了"hi, nice to meet you"后人类回复“nice to meet you”,现在轮到机器人回复了。如果在交互模式中,texts应为str,模型将自动构建它的上下文。 -* max_turn (int): 模型能记忆的对话轮次,当max_turn = 1时,模型只能记住当前对话,无法获知之前的对话内容。 + - ```python + def interactive_mode(max_turn=6): + ``` -**代码示例** + - 进入交互模式。交互模式中,generate接口的texts将支持字符串类型。 + - **参数** + - max_turn (int): 模型能记忆的对话轮次,当max_turn = 1时,模型只能记住当前对话,无法获知之前的对话内容。 -```python -import paddlehub as hub -module = hub.Module(name="plato2_en_large") +## 四、服务部署 -with module.interactive_mode(max_turn=6): - while True: - human_utterance = input("[Human]: ").strip() - robot_utterance = module.generate(human_utterance) - print("[Bot]: %s"%robot_utterance[0]) -``` +- PaddleHub Serving可以部署一个在线对话机器人服务。 -## 服务部署 +- ### 第一步:启动PaddleHub Serving -PaddleHub Serving 可以部署在线服务。 + - 运行启动命令: + - ```shell + $ hub serving start -m plato2_en_large -p 8866 + ``` -### 第一步:启动PaddleHub Serving + - 这样就完成了一个对话机器人服务化API的部署,默认端口号为8866。 + - **NOTE:** 
如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 -运行启动命令: -```shell -$ hub serving start -m plato2_en_large -p 8866 -``` -这样就完成了一个服务化API的部署,默认端口号为8866。 +- ### 第二步:发送预测请求 -**NOTE:** 在启动服务之前,请设置CUDA\_VISIBLE\_DEVICES环境变量。 + - 配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 -### 第二步:发送预测请求 + - ```python + import requests + import json -方式1: 自定义脚本发送对话信息 + data = {'texts':["Hello","Hello\thi, nice to meet you\tnice to meet you"]} + headers = {"Content-type": "application/json"} + url = "http://127.0.0.1:8866/predict/plato2_en_large" + r = requests.post(url=url, headers=headers, data=json.dumps(data)) -配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + # 保存结果 + results = r.json()["results"] + for result in results: + print(result) + ``` -```python -import requests -import json + - 关于PaddleHub Serving更多信息参考[服务部署](../../../../docs/docs_ch/tutorial/serving.md) -# 发送HTTP请求 +## 五、更新历史 -data = {'texts':["Hello","Hello\thi, nice to meet you\tnice to meet you"]} -headers = {"Content-type": "application/json"} -url = "http://127.0.0.1:8866/predict/plato2_en_large" -r = requests.post(url=url, headers=headers, data=json.dumps(data)) - -# 保存结果 -results = r.json()["results"] -for result in results: - print(result) -``` - -方式2: 通过交互式客户端进入交互模式 - -您可以执行以下客户端脚本进入交互模式: - -```python -import requests -import json - -ADDR = "127.0.0.1" # Your serving address -PORT = 8866 # Your serving port -MAX_TURN = 6 - -headers = {"Content-type": "application/json"} -url = "http://%s:%s/predict/plato2_en_large" % (ADDR, PORT) - -context = [] -while True: - user_utt = input("[Human]: ").strip() - if user_utt == "[NEXT]": - context = "" - print("Restart") - else: - context.append(user_utt) - data = {'texts': ["\t".join(context[-MAX_TURN:])]} - r = requests.post(url=url, headers=headers, data=json.dumps(data)) - bot_response = r.json()["results"][0] - print("[Bot]: %s"%bot_response) - context.append(bot_response) -``` - -## 查看代码 - -https://github.com/PaddlePaddle/Knover - -### 依赖 - -1.8.2 <= paddlepaddle < 2.0.0 - -1.7.0 <= paddlehub < 2.0.0 +* 1.0.0 + 初始发布 -## 更新历史 +* 1.1.0 -* 1.0.0 + 移除 Fluid API - 初始发布 + - ```shell + $ hub install plato==1.1.0 + ``` diff --git a/modules/text/text_generation/plato2_en_large/__init__.py b/modules/text/text_generation/plato2_en_large/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/modules/text/text_generation/plato2_en_large/model.py b/modules/text/text_generation/plato2_en_large/model.py new file mode 100644 index 00000000..0b322a7a --- /dev/null +++ b/modules/text/text_generation/plato2_en_large/model.py @@ -0,0 +1,451 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
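+#
+# Paddle dygraph implementation used for PLATO-2 inference: Plato2EncoderLayer
+# and Plato2Encoder build the unified transformer stack (fed with an auxiliary
+# latent embedding), NSP scores next-sentence prediction for re-ranking, and
+# Plato2InferModel decodes candidate responses with top-k sampling and keeps
+# the candidate with the best NSP score.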
+from collections import namedtuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def post_process_context(token_ids, reader, merge=True): + """Post-process the context sequence.""" + context = [] + utt = [] + for tok_id in token_ids[1:]: + if tok_id == reader.eos_id: + utt = reader.tokenizer.convert_ids_to_tokens(utt) + if merge: + utt = reader.tokenizer.merge_subword(utt) + context.append(utt) + utt = [] + else: + utt.append(tok_id) + return context + + +def post_process_response(token_ids, reader, merge=True): + """ + Post-process the decoded sequence. Truncate from the first + and remove the and tokens currently. + """ + eos_pos = len(token_ids) + for i, tok_id in enumerate(token_ids): + if tok_id == reader.eos_id: + eos_pos = i + break + token_ids = token_ids[1:eos_pos] + response = reader.tokenizer.convert_ids_to_tokens(token_ids) + if merge: + response = reader.tokenizer.merge_subword(response) + return token_ids, response + + +def get_cross_turn_repetition(context, pred_tokens, eos_idx, is_cn=False): + """Get cross-turn repetition.""" + if len(pred_tokens) == 0: + return 1.0 + if is_cn: + context = ["".join(utt) for utt in context] + pred_tokens = "".join(pred_tokens) + + pred_tri_grams = set() + for i in range(len(pred_tokens) - 2): + tri_gram = tuple(pred_tokens[i:i + 3]) + pred_tri_grams.add(tri_gram) + for utt in context: + for i in range(len(utt) - 2): + tri_gram = tuple(utt[i:i + 3]) + if tri_gram in pred_tri_grams: + return 1.0 + return 0.0 + + +def get_in_turn_repetition(pred, is_cn=False): + """Get in-turn repetition.""" + if len(pred) == 0: + return 1.0 + if isinstance(pred[0], str): + pred = [tok.lower() for tok in pred] + if is_cn: + pred = "".join(pred) + tri_grams = set() + for i in range(len(pred) - 2): + tri_gram = tuple(pred[i:i + 3]) + if tri_gram in tri_grams: + return 1.0 + tri_grams.add(tri_gram) + return 0.0 + + +class Plato2EncoderLayer(nn.Layer): + + def __init__(self, n_head, hidden_size, attn_dropout, act_dropout): + super(Plato2EncoderLayer, self).__init__() + + self.self_attn = nn.MultiHeadAttention(hidden_size, n_head, attn_dropout) + self.pre_norm_layer = nn.LayerNorm(hidden_size) + self.post_norm_layer = nn.LayerNorm(hidden_size) + self.fc1 = nn.Linear(hidden_size, hidden_size * 4) + self.fc2 = nn.Linear(hidden_size * 4, hidden_size) + + self.dropout_layer = nn.Dropout(act_dropout) + self.gelu_layer = nn.GELU() + + def forward(self, x, attn_mask, cache): + query = self.pre_norm_layer(x) + attn_output, new_cache = self.self_attn(query, None, None, attn_mask, cache) + attn_output = self.dropout_layer(attn_output) + attn_output = attn_output + x + ffd_input = self.post_norm_layer(attn_output) + + ffd_output = self.fc1(ffd_input) + ffd_output = self.gelu_layer(ffd_output) + ffd_output = self.dropout_layer(ffd_output) + + ffd_output = self.fc2(ffd_output) + ffd_output = self.dropout_layer(ffd_output) + out = ffd_output + attn_output + + return out, new_cache + + def gen_cache(self, key): + return self.self_attn.gen_cache(key) + + +class Plato2Encoder(nn.Layer): + + def __init__(self, vocab_size, type_size, max_position_seq_len, num_layers, n_head, hidden_size, attn_dropout, + act_dropout): + super(Plato2Encoder, self).__init__() + + self.n_head = n_head + + self.word_embedding_layer = nn.Embedding(vocab_size, hidden_size) + self.sent_embedding_layer = nn.Embedding(type_size, hidden_size) + self.pos_embedding_layer = nn.Embedding(max_position_seq_len, hidden_size) + + self.encoder_layers = [] + for i in range(num_layers): + 
encoder_layer = Plato2EncoderLayer(n_head, hidden_size, attn_dropout, act_dropout) + self.encoder_layers.append(encoder_layer) + self.add_sublayer('layers.' + str(i), encoder_layer) + self.post_encoder_layer_norm = nn.LayerNorm(hidden_size) + + self.dropout_layer = nn.Dropout(act_dropout) + + def forward(self, caches, token_ids, type_ids, pos_ids, generation_mask, aux_emb=None): + out, self_attn_mask = self.gen_input(token_ids, type_ids, pos_ids, generation_mask, aux_emb) + + new_caches = [] + for i, encoder_layer in enumerate(self.encoder_layers): + out, new_cache = encoder_layer(out, self_attn_mask, caches[i]) + new_caches.append(new_cache) + + enc_output = self.post_encoder_layer_norm(out) + return enc_output, new_caches + + def gen_input(self, token_ids, type_ids, pos_ids, input_mask, aux_emb=None): + token_emb_out = self.word_embedding_layer(token_ids) + type_emb_out = self.sent_embedding_layer(type_ids) + pos_emb_out = self.pos_embedding_layer(pos_ids) + emb_out = token_emb_out + type_emb_out + pos_emb_out + + # auxiliary memory embeddings + if aux_emb is not None: + emb_out = paddle.concat([aux_emb, emb_out], axis=1) + + emb_out = self.dropout_layer(emb_out) + + # generate n-head self-attention mask + self_attn_mask = input_mask + self_attn_mask = paddle.scale(x=self_attn_mask, scale=1e4, bias=-1.0, bias_after_scale=False) + n_head_self_attn_mask = paddle.stack(x=[self_attn_mask] * self.n_head, axis=1) + n_head_self_attn_mask.stop_gradient = True + + return emb_out, n_head_self_attn_mask + + def gen_caches(self, key): + caches = [encoder_layer.gen_cache(key) for encoder_layer in self.encoder_layers] + return caches + + +class NSP(nn.Layer): + + def __init__(self, vocab_size, type_size, max_position_seq_len, num_layers, n_head, hidden_size, attn_dropout, + act_dropout): + super(NSP, self).__init__() + + self.n_head = n_head + self.hidden_size = hidden_size + + self.word_embedding_layer = nn.Embedding(vocab_size, hidden_size) + self.sent_embedding_layer = nn.Embedding(type_size, hidden_size) + self.pos_embedding_layer = nn.Embedding(max_position_seq_len, hidden_size) + + encoder_layer = nn.TransformerEncoderLayer(hidden_size, n_head, hidden_size * 4, act_dropout, 'gelu', + attn_dropout, act_dropout, 'True') + encoder_norm = nn.LayerNorm(hidden_size) + self.encoder = nn.TransformerEncoder(encoder_layer, num_layers, encoder_norm) + self.fc1 = nn.Linear(hidden_size, hidden_size) + self.fc2 = nn.Linear(hidden_size, 2) + + self.dropout_layer = nn.Dropout(act_dropout) + self.tanh_layer = nn.Tanh() + self.softmax = nn.Softmax() + + def forward(self, inputs): + token_ids = inputs['token_ids'] + type_ids = inputs['type_ids'] + pos_ids = inputs['pos_ids'] + attention_mask = inputs['attention_mask'] + label_pos = inputs["label_pos"] + + out, self_attn_mask = self.gen_input(token_ids, type_ids, pos_ids, attention_mask) + # [-1, seq_len, hidden_size] + enc_out = self.encoder(out, self_attn_mask) + + enc_out = paddle.reshape(enc_out, [-1, self.hidden_size]) + label_pos = paddle.cast(label_pos, 'int64') + out = paddle.gather(enc_out, label_pos) + pooled_out = self.fc1(out) + pooled_out = self.tanh_layer(pooled_out) + + # [-1, 2] + logits = self.fc2(pooled_out) + probs = self.softmax(logits) + + return probs + + def gen_input(self, token_ids, type_ids, pos_ids, input_mask, aux_emb=None): + token_emb_out = self.word_embedding_layer(token_ids) + type_emb_out = self.sent_embedding_layer(type_ids) + pos_emb_out = self.pos_embedding_layer(pos_ids) + emb_out = token_emb_out + type_emb_out + pos_emb_out + + 
# auxiliary memory embeddings + if aux_emb is not None: + emb_out = paddle.concat([aux_emb, emb_out], axis=1) + + emb_out = self.dropout_layer(emb_out) + + # generate n-head self-attention mask + self_attn_mask = input_mask + self_attn_mask = paddle.scale(x=self_attn_mask, scale=1e4, bias=-1.0, bias_after_scale=False) + n_head_self_attn_mask = paddle.stack(x=[self_attn_mask] * self.n_head, axis=1) + n_head_self_attn_mask.stop_gradient = True + + return emb_out, n_head_self_attn_mask + + +class Plato2InferModel(nn.Layer): + + def __init__(self, + nsp_reader, + num_layers, + n_head, + hidden_size, + vocab_size=8001, + type_size=2, + latent_type_size=20, + max_position_seq_len=256, + act_dropout=0.1, + attn_dropout=0.1, + max_dec_len=64, + min_dec_len=1, + topk=10): + super(Plato2InferModel, self).__init__() + + self.nsp_reader = nsp_reader + self.num_layers = num_layers + self.latent_type_size = latent_type_size + self.max_dec_len = max_dec_len + self.min_dec_len = min_dec_len + self.topk = topk + self.unk_id = 0 + self.bos_id = 1 + self.eos_id = 2 + self.mask_id = 8000 + self.after_eos = paddle.ones([vocab_size]) * -1e9 + self.after_eos[self.eos_id] = 0 + self.is_cn = False + self.batch_size = 1 + + self.latent_weight = paddle.create_parameter([hidden_size, latent_type_size], 'float32') + + self.plato2_encoder = Plato2Encoder(vocab_size, type_size, max_position_seq_len, num_layers, n_head, + hidden_size, attn_dropout, act_dropout) + + self.logits_fc_layer = nn.Linear(hidden_size, hidden_size) + self.logits_layer_norm = nn.LayerNorm(hidden_size) + self.logits_bias = paddle.create_parameter([vocab_size], 'float32', is_bias=True) + + self.nsp_predictor = NSP(vocab_size, type_size, max_position_seq_len, num_layers, n_head, hidden_size, + attn_dropout, act_dropout) + + self.gelu_layer = nn.GELU() + self.softmax = nn.Softmax() + + @paddle.no_grad() + def forward(self, inputs): + token_ids = inputs['token_ids'] + type_ids = inputs['type_ids'] + pos_ids = inputs['pos_ids'] + generation_mask = inputs['generation_mask'] + latent_id = inputs['latent_id'] + data_id = inputs['data_id'] + + # [-1, 1, latent_type_size] + latent_id = F.one_hot(latent_id, self.latent_type_size) + # [-1, 1, hidden_size] + latent_emb = paddle.matmul(latent_id, self.latent_weight, transpose_y=True) + + caches = self.plato2_encoder.gen_caches(token_ids) + + # [-1, seq_len + 1, hidden_size] + enc_out, new_caches = self.plato2_encoder(caches, token_ids, type_ids, pos_ids, generation_mask, latent_emb) + + pred_ids = self.decode(inputs, new_caches) + + nsp_inputs = self.gen_nsp_input(token_ids, pred_ids) + # [-1, 2] + probs = self.nsp_predictor(nsp_inputs) + + return self.get_results(data_id, token_ids, pred_ids, probs) + + def decode(self, inputs, caches): + tgt_ids = inputs['tgt_ids'] + tgt_pos = inputs['tgt_pos'] + tgt_generation_mask = inputs['tgt_generation_mask'] + predictions = tgt_ids + + # TODO + step = 0 + while step < self.max_dec_len: + # [-1, 1] + append_mask = paddle.cast(tgt_ids != self.eos_id, dtype=tgt_generation_mask.dtype) + tgt_generation_mask = paddle.concat([tgt_generation_mask, paddle.unsqueeze(append_mask, 1)], axis=-1) + tgt_sent = paddle.ones([tgt_generation_mask.shape[0], 1], dtype=tgt_ids.dtype) + + # [-1, 1, hidden_size] + out, caches = self.plato2_encoder(caches, tgt_ids, tgt_sent, tgt_pos, tgt_generation_mask) + out = paddle.squeeze(out, axis=1) + + # [-1, hidden_size] + trans = self.logits_fc_layer(out) + trans = self.gelu_layer(trans) + trans = self.logits_layer_norm(trans) + + # [-1, vocab_size] + 
logits = paddle.matmul(trans, self.plato2_encoder.word_embedding_layer.weight, + transpose_y=True) + self.logits_bias + logits[:, self.unk_id] = -1e9 + logits[:, self.bos_id] = -1e9 + logits[:, self.mask_id] = -1e9 + if step < self.min_dec_len: + logits[:, self.eos_id] = -1e9 + logits = logits * append_mask + (1 - append_mask) * self.after_eos + probs = self.softmax(logits) + + # [-1, topk] + topk_probs, _ = paddle.topk(probs, k=self.topk) + mask = paddle.cast(probs >= topk_probs[:, -1:], 'float32') + sums = paddle.sum(topk_probs, axis=-1, keepdim=True) + new_probs = probs * mask / sums + # [-1, 1] + sampling_ids = paddle.multinomial(new_probs) + + step = step + 1 + tgt_ids = sampling_ids + tgt_pos = tgt_pos + 1 + predictions = paddle.concat([predictions, tgt_ids], axis=1) + return predictions + + def gen_nsp_input(self, token_ids, pred_ids): + token_ids = token_ids.numpy() + pred_ids = pred_ids.numpy() + + def __reader__(): + headers = ["src", "tgt", "data_id"] + + Example = namedtuple("Example", headers) + + for i, (raw, pred) in enumerate(zip(token_ids, pred_ids)): + context = post_process_context(raw, self.nsp_reader, merge=False) + _, response = post_process_response(pred, self.nsp_reader, merge=False) + context_tokenized_input = " [SEP] ".join(" ".join(utt) for utt in context) + response_tokenized_input = " ".join(response) + example = Example(src=context_tokenized_input, tgt=response_tokenized_input, data_id=i) + data = self.nsp_reader._convert_example_to_record(example, is_infer=True) + yield data + return + + generator = self.nsp_reader.data_generator( + reader=__reader__, + is_infer=True, + phase="test", + ) + inputs = next(generator()) + + #print('\nnsp_inputs:') + for key in inputs: + inputs[key] = paddle.to_tensor(inputs[key]) + if key in ['token_ids', 'type_ids', 'pos_ids']: + inputs[key] = paddle.squeeze(inputs[key], axis=-1) + #print(key, inputs[key].shape) + #print(inputs[key]) + return inputs + + def get_results(self, data_id, token_ids, pred_ids, probs): + data_id = data_id.numpy() + token_ids = token_ids.numpy() + pred_ids = pred_ids.numpy() + probs = probs.numpy() + + infos = [] + for raw, pred, prob in zip(token_ids, pred_ids, probs): + tokens = post_process_context(raw, self.nsp_reader) + pred_token_ids, pred_tokens = post_process_response(pred, self.nsp_reader) + info = {} + info['response'] = ' '.join(pred_tokens) + cross_turn_repetition = get_cross_turn_repetition(tokens, pred_tokens, self.nsp_reader.eos_id, self.is_cn) + in_turn_repetition = max(get_in_turn_repetition(pred_tokens, self.is_cn), + get_in_turn_repetition(pred_token_ids)) + + info['score'] = float(prob[1]) + if len(pred_token_ids) >= self.max_dec_len: + info['score'] -= 1e3 + elif cross_turn_repetition > 0: + info['score'] -= 1e3 + elif in_turn_repetition > 0: + info['score'] -= 1e3 + infos.append(info) + + results = [] + pre_idx = 0 + sample = [] + for idx, info in zip(data_id, infos): + if idx != pre_idx: + sample = sorted(sample, key=lambda info: -info["score"]) + result = sample[0] + result['data_id'] = pre_idx + results.apeend(result) + sample = [] + pre_idx = idx + sample.append(info) + if sample: + sample = sorted(sample, key=lambda info: -info["score"]) + result = sample[0] + result['data_id'] = pre_idx + results.append(result) + return results diff --git a/modules/text/text_generation/plato2_en_large/models/__init__.py b/modules/text/text_generation/plato2_en_large/models/__init__.py deleted file mode 100644 index 353c85a2..00000000 --- 
a/modules/text/text_generation/plato2_en_large/models/__init__.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Define model.""" - -from plato2_en_large.models.model_base import Model - -MODEL_REGISTRY = {} - -__all__ = ["MODEL_REGISTRY", "register_model", "create_model", "add_cmdline_args"] - - -def register_model(name): - """ - Register a new model class. - """ - - def __wrapped__(cls): - if name in MODEL_REGISTRY: - raise ValueError(f"Cannot register duplicate model ({name})") - if not issubclass(cls, Model): - raise ValueError(f"Model ({name}: {cls.__name__}) must extend Model") - MODEL_REGISTRY[name] = cls - return cls - - return __wrapped__ - - -def create_model(args, place) -> Model: - """ - Create a model. - """ - return MODEL_REGISTRY[args.model](args, place) - - -def add_cmdline_args(parser): - """ Add cmdline argument of Model. """ - group = parser.add_argument_group("Model") - - # Model - group.add_argument("--model", type=str, required=True) - - # Config - group.add_argument("--config_path", type=str, required=True) - - # Model related. - args, _ = parser.parse_known_args() - if args.model not in MODEL_REGISTRY: - raise ValueError(f"Unknown model type: {args.model}") - MODEL_REGISTRY[args.model].add_cmdline_args(parser) - return group - - -import plato2_en_large.models.nsp_model -import plato2_en_large.models.plato diff --git a/modules/text/text_generation/plato2_en_large/models/generator.py b/modules/text/text_generation/plato2_en_large/models/generator.py deleted file mode 100644 index 0117f2f7..00000000 --- a/modules/text/text_generation/plato2_en_large/models/generator.py +++ /dev/null @@ -1,268 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Generator class""" - -import numpy as np -import paddle.fluid.layers as layers - -from plato2_en_large.utils.args import str2bool - - -class Generator(object): - """ - Generator class - - Use generator in inference phase. 
- """ - - @classmethod - def add_cmdline_args(cls, parser): - """Add cmdline argurments.""" - group = parser.add_argument_group("Generator") - group.add_argument("--min_dec_len", type=int, default=1) - group.add_argument("--max_dec_len", type=int, default=64) - - group.add_argument( - "--decoding_strategy", - type=str, - default="topk_sampling", - choices=["beam_search", "topk_sampling", "topp_sampling"]) - group.add_argument("--temperature", type=float, default=1.) - group.add_argument("--ignore_unk", type=str2bool, default=True) - - # multi sampling - group.add_argument("--num_samples", type=int, default=None) - - # top-k sampling - group.add_argument("--topk", type=int, default=10) - - # top-p sampling - group.add_argument("--topp", type=float, default=0.9) - - # beam search - group.add_argument("--beam_size", type=int, default=10) - group.add_argument("--length_average", type=str2bool, default=True) - group.add_argument("--length_penalty", type=float, default=0.0) - - return group - - def __init__(self, args): - self.min_dec_len = args.min_dec_len - self.max_dec_len = args.max_dec_len - self.eos_id = args.eos_id - self.unk_id = args.unk_id - self.mask_id = args.mask_id - self.vocab_size = args.vocab_size - - # model related - - # basic settings - self.decoding_strategy = args.decoding_strategy - self.ignore_unk = args.ignore_unk - self.continuous_position = args.continuous_position - self.temperature = args.temperature - - # reranking - self.num_samples = args.num_samples - - # top-k sampling - self.topk = args.topk - - # top-p sampling - self.topp = args.topp - - # beam search - self.beam_size = args.beam_size - self.length_penalty = args.length_penalty - self.length_average = args.length_average - return - - def inference(self, model, inputs, outputs): - """ - Run inference. - - Args: - inputs(dict): Its key is input name(str) and its value is a Variable. - model(object): A generate model. Need to implement `_generation_network` and `_calc_logits`. - - Returns: - dict(str:Variable): Its key is output name(str) and its value is a Variable. 
- """ - # prepare while loop - max_len = layers.fill_constant(shape=[1], dtype="int64", value=self.max_dec_len, force_cpu=True) - min_len = layers.fill_constant(shape=[1], dtype="int64", value=self.min_dec_len, force_cpu=True) - step_idx = layers.fill_constant(shape=[1], dtype="int64", value=0, force_cpu=True) - - ids = layers.array_write(layers.reshape(inputs["tgt_ids"], (-1, 1)), step_idx) - pos_biases = layers.array_write(layers.reshape(inputs["tgt_pos"], (-1, 1)), step_idx) - scores = layers.array_write(inputs["init_score"], step_idx) - tgt_generation_mask = layers.array_write(inputs["tgt_generation_mask"], step_idx) - parent_idx = inputs["parent_idx"] - - if self.decoding_strategy == "beam_search": - beam_size = self.beam_size - else: - beam_size = 1 - - eos_penalty = np.zeros(self.vocab_size, dtype="float32") - eos_penalty[self.eos_id] = -1e9 - eos_penalty = layers.assign(eos_penalty) - - token_penalty = np.zeros(self.vocab_size, dtype="float32") - token_penalty[self.unk_id] = -1e9 - if self.mask_id >= 0: - token_penalty[self.mask_id] = -1e9 - token_penalty = layers.assign(token_penalty) - - # start while loop - cond = layers.less_than(x=step_idx, y=max_len) - while_op = layers.While(cond) - with while_op.block(): - pre_ids = layers.array_read(array=ids, i=step_idx) - pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True) - pre_scores = layers.array_read(array=scores, i=step_idx) - pos_bias = layers.array_read(array=pos_biases, i=step_idx) - pos_bias = layers.gather(input=pos_bias, index=parent_idx) - - tmp_tgt_generation_mask = layers.array_read(tgt_generation_mask, i=step_idx) - dtype = tmp_tgt_generation_mask.dtype - - append_mask = layers.fill_constant_batch_size_like(input=pre_ids, value=1.0, shape=[-1, 1, 1], dtype=dtype) - tmp_tgt_generation_mask = layers.concat([tmp_tgt_generation_mask, append_mask], axis=2) - pre_mask = tmp_tgt_generation_mask = layers.gather(input=tmp_tgt_generation_mask, index=parent_idx) - - pre_sent = layers.fill_constant_batch_size_like( - input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype) - - if self.continuous_position: - pre_pos = layers.elementwise_mul( - x=layers.fill_constant_batch_size_like( - input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype), - y=step_idx, - axis=0) + pos_bias - else: - pre_pos = layers.elementwise_mul( - x=layers.fill_constant_batch_size_like( - input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype), - y=step_idx, - axis=0) - - dec_out, _ = model._generation_network( - token_ids=pre_ids, - type_ids=pre_sent, - pos_ids=pre_pos, - generation_mask=tmp_tgt_generation_mask, - gather_idx=parent_idx) - logits = model._calc_logits(dec_out) - - # ignore unk and mask token - if self.ignore_unk: - logits = layers.elementwise_add(logits, token_penalty, axis=1) - - # min dec length - min_len_cond = layers.less_than(x=step_idx, y=min_len) - - def min_len_penalty(): - """Plus minimum length penalty.""" - return layers.elementwise_add(logits, eos_penalty, axis=1) - - def no_penalty(): - """No penalty.""" - return logits - - logits = layers.case([(min_len_cond, min_len_penalty)], default=no_penalty) - - # get probs - probs = layers.softmax(logits / self.temperature) - - if self.decoding_strategy == "beam_search": - topk_scores, topk_indices = layers.topk(input=probs, k=beam_size) - else: - if self.decoding_strategy.startswith("sampling"): - sampling_ids = layers.sampling_id(probs, dtype="int") - elif self.decoding_strategy.startswith("topk_sampling"): - topk_probs, _ = layers.topk(input=probs, 
k=self.topk) - ge_cond = layers.cast( - layers.greater_equal(probs, layers.unsqueeze(topk_probs[:, -1], [1])), "float32") - old_probs = probs - probs = probs * ge_cond / layers.reduce_sum(topk_probs, dim=-1, keep_dim=True) - sampling_ids = layers.sampling_id(probs, dtype="int") - probs = old_probs - elif self.decoding_strategy.startswith("topp_sampling"): - sorted_probs, sorted_idx = layers.argsort(probs, descending=True) - cum_sorted_probs = layers.cumsum(sorted_probs, axis=1, exclusive=True) - lt_cond = layers.cast( - layers.less_than( - cum_sorted_probs, - layers.fill_constant_batch_size_like(cum_sorted_probs, cum_sorted_probs.shape, - cum_sorted_probs.dtype, self.topp)), "float32") - old_probs = probs - candidate_probs = sorted_probs * lt_cond - probs = candidate_probs / layers.reduce_sum(candidate_probs, dim=-1, keep_dim=True) - sampling_ids = layers.sampling_id(probs, dtype="int") - sampling_ids = layers.index_sample(sorted_idx, layers.unsqueeze(sampling_ids, [1])) - sampling_ids = layers.squeeze(sampling_ids, [1]) - probs = old_probs - else: - raise ValueError(self.decoding_strategy) - - sampling_scores = layers.one_hot(layers.unsqueeze(sampling_ids, [1]), probs.shape[1]) - sampling_scores = sampling_scores * probs - (1 - sampling_scores) * 1e3 - topk_scores, topk_indices = layers.topk(input=sampling_scores, k=1) - - pre_len = layers.cast(step_idx, "float32") - layers.increment(x=step_idx, value=1.0, in_place=True) - cur_len = layers.cast(step_idx, "float32") - - # update scores - if self.length_average: - accu_scores = layers.elementwise_add( - x=layers.log(topk_scores), y=pre_scores * pre_len, axis=0) / cur_len - elif self.length_penalty > 0: - pre_lp = layers.pow((5 + pre_len) / 6, self.length_penalty) - cur_lp = layers.pow((5 + cur_len) / 6, self.length_penalty) - accu_scores = layers.elementwise_add(x=layers.log(topk_scores), y=pre_scores * pre_lp, axis=0) / cur_lp - else: - accu_scores = layers.elementwise_add(x=layers.log(topk_scores), y=pre_scores, axis=0) - topk_indices = layers.lod_reset(topk_indices, pre_ids) - accu_scores = layers.lod_reset(accu_scores, pre_ids) - selected_ids, selected_scores, gather_idx = layers.beam_search( - pre_ids=pre_ids, - pre_scores=pre_scores, - ids=topk_indices, - scores=accu_scores, - beam_size=beam_size, - end_id=self.eos_id, - return_parent_idx=True) - - layers.array_write(selected_ids, i=step_idx, array=ids) - layers.array_write(selected_scores, i=step_idx, array=scores) - layers.array_write(pre_mask, i=step_idx, array=tgt_generation_mask) - layers.array_write(pos_bias, i=step_idx, array=pos_biases) - - layers.assign(gather_idx, parent_idx) - - length_cond = layers.less_than(x=step_idx, y=max_len) - finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) - layers.logical_and(x=length_cond, y=finish_cond, out=cond) - - finished_ids, finished_scores = layers.beam_search_decode(ids, scores, beam_size=beam_size, end_id=self.eos_id) - - predictions = { - "finished_ids": finished_ids, - "finished_scores": finished_scores, - "token_ids": inputs["token_ids"], - "data_id": inputs["data_id"] - } - return predictions diff --git a/modules/text/text_generation/plato2_en_large/models/model_base.py b/modules/text/text_generation/plato2_en_large/models/model_base.py deleted file mode 100644 index 6edcadda..00000000 --- a/modules/text/text_generation/plato2_en_large/models/model_base.py +++ /dev/null @@ -1,288 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Model base.""" - -from abc import abstractmethod, ABC - -import paddle.fluid as fluid -from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy -import paddle.fluid.incubate.fleet.base.role_maker as role_maker -import paddle.fluid.layers as layers - -from plato2_en_large.models.optimizer import AdamW -from plato2_en_large.utils import init_pretraining_params, init_checkpoint, to_lodtensor -from plato2_en_large.utils.args import str2bool - - -class Model(ABC): - """ - Basic model wrapper for paddle. - """ - - @classmethod - def add_cmdline_args(cls, parser): - """Add cmdline argurments.""" - group = parser.add_argument_group("Model") - # Init checkpoint - group.add_argument("--init_checkpoint", type=str, default="") - group.add_argument("--init_pretraining_params", type=str, default="") - - # Optimizer - group.add_argument("-lr", "--learning_rate", type=float, default=1e-5, help="The learning rate for optimizer.") - group.add_argument("--warmup_steps", type=int, default=0, help="The warmup steps.") - group.add_argument("--weight_decay", type=float, default=0.0, help="The weight decay for optimizer.") - group.add_argument("--max_grad_norm", type=float, default=.1, help="The maximum norm of gradient.") - - group.add_argument("--use_recompute", type=str2bool, default=False) - group.add_argument("--use_amp", type=str2bool, default=False) - group.add_argument("--amp_loss_scaling", type=float, default=12800) - return group - - def __init__(self, args, place): - self.place = place - self.exe = fluid.Executor(place) - - self.init_checkpoint = args.init_checkpoint - self.init_pretraining_params = args.init_pretraining_params - - self.learning_rate = args.learning_rate - self.warmup_steps = args.warmup_steps - self.weight_decay = args.weight_decay - self.max_grad_norm = args.max_grad_norm - - self.is_distributed = args.is_distributed - self.use_recompute = args.use_recompute - self.use_amp = args.use_amp - self.amp_loss_scaling = args.amp_loss_scaling - self.run_infer = args.get("run_infer", False) - self.batch_size = args.get("batch_size", 1) - self._build_programs() - return - - def _build_programs(self): - """ - Build programs. - - Build train_program, eval_program and inference_program. Only use in static graph mode. 
- """ - if self.run_infer: - self.startup_program = fluid.Program() - # build infer program - self.infer_program = fluid.Program() - with fluid.program_guard(self.infer_program, self.startup_program): - with fluid.unique_name.guard(): - self.infer_feed_dict = inputs = self._get_feed_dict(is_infer=True) - outputs = self.forward(inputs, is_infer=True) - predictions = self.infer(inputs, outputs) - self.infer_fetch_dict = predictions - self.infer_program = self.infer_program.clone(for_test=True) - - self.program = self.infer_program - else: - if self.is_distributed: - exec_strategy = fluid.ExecutionStrategy() - exec_strategy.use_experimental_executor = True - exec_strategy.num_threads = 4 - exec_strategy.num_iteration_per_drop_scope = 1 - - dist_strategy = DistributedStrategy() - dist_strategy.exec_strategy = exec_strategy - dist_strategy.nccl_comm_num = 1 - dist_strategy.fuse_all_reduce_ops = True - if self.use_recompute: - dist_strategy.forward_recompute = True - dist_strategy.enable_sequential_execution = True - if self.use_amp: - dist_strategy.use_amp = True - dist_strategy.amp_loss_scaling = self.amp_loss_scaling - self.dist_strategy = dist_strategy - - self.startup_program = fluid.Program() - # build train program - self.train_program = fluid.Program() - with fluid.program_guard(self.train_program, self.startup_program): - with fluid.unique_name.guard(): - self.feed_dict = inputs = self._get_feed_dict() - outputs = self.forward(inputs) - if self.is_distributed and self.use_recompute: - self.dist_strategy.recompute_checkpoints = outputs["checkpoints"] - metrics, statistics = self.get_metrics_and_statistics(inputs, outputs) - - # build eval program - self.eval_program = self.train_program.clone(for_test=True) - self.eval_fetch_dict = {**metrics, **statistics} - - scheduled_lr = self.optimize(metrics) - metrics["scheduled_lr"] = scheduled_lr - self.train_fetch_dict = metrics - - self.program = self.train_program - if self.is_distributed: - self.train_program = fleet.main_program - - self.exe.run(self.startup_program) - if self.init_pretraining_params != "": - init_pretraining_params(self.exe, self.init_pretraining_params, self.program) - elif self.init_checkpoint != "": - init_checkpoint(self.exe, self.init_checkpoint, self.program) - return - - def load(self, model_dir, is_checkpoint=False): - """ - Load persistables or parameters. - """ - # TODO: support dygraph. - if is_checkpoint: - init_checkpoint(self.exe, model_dir, self.program) - else: - init_pretraining_params(self.exe, model_dir, self.program) - return - - def save(self, model_dir, is_checkpoint=False): - """ - Save persistables or parameters. - """ - # TODO: support dygraph. - if is_checkpoint: - fluid.io.save_persistables(self.exe, model_dir, self.program) - else: - fluid.io.save_params(self.exe, model_dir, self.program) - return - - @abstractmethod - def _get_feed_dict(self, is_infer=False): - """ - Return input feed list. - """ - pass - - def _get_feed(self, inputs, is_infer=False): - """ - Convert `inputs` into model's feed data format. - """ - if isinstance(inputs, list): - # return list direclty which is used in `get_data_loader`. - return inputs - for k in inputs: - if isinstance(inputs[k], list): - inputs[k] = to_lodtensor(inputs[k], self.place) - return inputs - - def get_data_loader(self, generator=None, is_infer=False): - """ - Return DataLoader. - - If generator is not `None`, the data loader set it as the batch generator. - """ - # TODO: support dygraph. 
- if is_infer: - feed_name_list, feed_list = zip(*self.infer_feed_dict.items()) - else: - feed_name_list, feed_list = zip(*self.feed_dict.items()) - loader = fluid.io.DataLoader.from_generator( - feed_list=feed_list, capacity=64, use_double_buffer=True, iterable=True) - if generator is not None: - - def __wrapper__(): - for batch in generator(): - batch = self._get_feed(batch) - batch = [batch[name] for name in feed_name_list if name in batch] - yield batch - - loader.set_batch_generator(__wrapper__, self.place) - return loader - - @abstractmethod - def forward(self, inputs, is_infer=False): - """ - Run model main forward. - """ - pass - - @abstractmethod - def get_metrics_and_statistics(self, inputs, outputs): - """ - Get metrics and statistics. - """ - pass - - @abstractmethod - def infer(self, inputs, outputs): - """ - Run model inference. - """ - pass - - def optimize(self, metrics): - """ - Optimize the model by metrics(mainly `metrics["loss"]`). - """ - # TODO: support dygraph - if self.warmup_steps > 0: - scheduled_lr = layers.learning_rate_scheduler.noam_decay(1 / (self.warmup_steps * (self.learning_rate**2)), - self.warmup_steps) - else: - scheduled_lr = layers.create_global_var( - name=fluid.unique_name.generate("learning_rate"), - shape=[1], - value=self.learning_rate, - dtype="float32", - persistable=True) - grad_clip = fluid.clip.GradientClipByGlobalNorm(self.max_grad_norm) - - self.optimizer = AdamW(learning_rate=scheduled_lr, grad_clip=grad_clip, weight_decay=self.weight_decay) - - if self.is_distributed: - self.optimizer = fleet.distributed_optimizer(self.optimizer, strategy=self.dist_strategy) - - self.optimizer.minimize(metrics["loss"]) - return scheduled_lr - - def _execute(self, program, feed, fetch_dict, **kwargs): - """ - Execute program. - """ - fetch_list = [var.name for var in fetch_dict.values()] - fetch_vars = self.exe.run(program, feed, fetch_list, **kwargs) - return dict(zip(fetch_dict.keys(), fetch_vars)) - - def train_step(self, inputs): - """ - Run one training step. - """ - # TODO: support dygraph. - return self._execute(self.train_program, self._get_feed(inputs), self.train_fetch_dict, use_program_cache=True) - - def eval_step(self, inputs): - """ - Run one evaluation step. - """ - # TODO: support dygraph. - return self._execute(self.eval_program, self._get_feed(inputs), self.eval_fetch_dict) - - def infer_step(self, inputs): - """ - Run one inference step. - """ - # TODO: support dygraph. - return self._execute(self.infer_program, self._get_feed(inputs, is_infer=True), self.infer_fetch_dict) - - def save_inference_model(self, inference_model_path): - """ - Save the inference model. - """ - feed_list = [var.name for var in self.infer_feed_dict.values()] - fetch_list = list(self.infer_fetch_dict.values()) - - fluid.io.save_inference_model(inference_model_path, feed_list, fetch_list, self.exe, self.infer_program) diff --git a/modules/text/text_generation/plato2_en_large/models/nsp_model.py b/modules/text/text_generation/plato2_en_large/models/nsp_model.py deleted file mode 100644 index 8c8f2d9a..00000000 --- a/modules/text/text_generation/plato2_en_large/models/nsp_model.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NSP model.""" - -import paddle.fluid as fluid -import paddle.fluid.layers as layers - -from . import register_model -from .model_base import Model -from .unified_transformer import UnifiedTransformer - - -@register_model("NSPModel") -class NSPModel(UnifiedTransformer): - """NSP model.""" - - def _get_feed_dict(self, is_infer=False): - """ - Get the feed list of the model. - - Args: - is_infer(bool): True if running inference. - - Returns: - list(Variable): The feed list. - list(str): The name of each Variable in feed list. - """ - feed_dict = {} - feed_dict["token_ids"] = layers.data(name="token_ids", shape=[-1, self.max_seq_len, 1], dtype="int64") - feed_dict["type_ids"] = layers.data(name="type_ids", shape=[-1, self.max_seq_len, 1], dtype="int64") - feed_dict["pos_ids"] = layers.data(name="pos_ids", shape=[-1, self.max_seq_len, 1], dtype="int64") - - feed_dict["attention_mask"] = layers.data( - name="attention_mask", shape=[-1, self.max_seq_len, self.max_seq_len], dtype=self.dtype) - feed_dict["label_pos"] = layers.data(name="label_pos", shape=[-1, 1], dtype="int64") - - if not is_infer: - feed_dict["label"] = layers.data(name="label", shape=[-1, 1], dtype="int64") - feed_dict["tgt_label"] = layers.data(name="tgt_ids", shape=[-1, 1], dtype="int64") - feed_dict["tgt_pos"] = layers.data(name="tgt_pos", shape=[-1, 1], dtype="int64") - - feed_dict["data_id"] = layers.data(name="data_id", shape=[-1, 1], dtype="int64") - return feed_dict - - def _get_feed(self, inputs, is_infer=False): - return Model._get_feed(self, inputs, is_infer) - - def forward(self, inputs, is_infer=False): - outputs = {} - self.generation_caches = None - outputs["enc_out"], self.checkpoints = self._generation_network( - token_ids=inputs["token_ids"], - type_ids=inputs["type_ids"], - pos_ids=inputs["pos_ids"], - generation_mask=inputs["attention_mask"]) - return outputs - - def _get_metrics(self, inputs, outputs): - metrics = {} - fc_out = self._calc_logits(outputs["enc_out"], inputs["tgt_pos"]) - lm_loss = layers.softmax_with_cross_entropy(logits=fc_out, label=inputs["tgt_pos"]) - need_cal = layers.not_equal(inputs["tgt_label"], layers.fill_constant(shape=[1], dtype="int64", value=1)) - need_cal = layers.cast(need_cal, self.dtype) - mean_lm_loss = layers.reduce_sum(lm_loss * need_cal) / (layers.reduce_sum(need_cal) + 1e-10) - - pooled_out = self._get_pooled_output(outputs["enc_out"], inputs["label_pos"]) - nsp_fc_out = layers.fc( - input=pooled_out, - size=2, - param_attr=fluid.ParamAttr(name="next_sent_fc.w_0", initializer=self.param_initializer), - bias_attr="next_sent_fc.b_0") - nsp_loss, nsp_softmax = layers.softmax_with_cross_entropy( - logits=nsp_fc_out, label=inputs["label"], return_softmax=True) - - nsp_acc = layers.accuracy(nsp_softmax, inputs["label"]) - mean_nsp_loss = layers.mean(nsp_loss) - - metrics["loss"] = mean_lm_loss + mean_nsp_loss - metrics["lm_loss"] = mean_lm_loss - metrics["nsp_loss"] = mean_nsp_loss - metrics["nsp_acc"] = nsp_acc - return metrics - - def infer(self, inputs, outputs): - pooled_out = self._get_pooled_output(outputs["enc_out"], inputs["label_pos"]) - nsp_fc_out = 
layers.fc( - input=pooled_out, - size=2, - param_attr=fluid.ParamAttr(name="next_sent_fc.w_0", initializer=self.param_initializer), - bias_attr="next_sent_fc.b_0") - scores = layers.softmax(nsp_fc_out) - predictions = {"scores": scores, "data_id": inputs["data_id"]} - return predictions - - def infer_step(self, inputs): - return Model.infer_step(self, inputs) diff --git a/modules/text/text_generation/plato2_en_large/models/optimizer.py b/modules/text/text_generation/plato2_en_large/models/optimizer.py deleted file mode 100644 index 0381f123..00000000 --- a/modules/text/text_generation/plato2_en_large/models/optimizer.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Optimizer.""" - -import re - -import paddle.fluid as fluid -import paddle.fluid.layers as layers - - -class AdamW(fluid.optimizer.AdamOptimizer): - """AdamW object for dygraph""" - - def __init__(self, *args, **kwargs): - weight_decay = kwargs.pop('weight_decay', None) - var_name_to_exclude = kwargs.pop('var_name_to_exclude', '.*layer_norm_scale|.*layer_norm_bias|.*b_0') - super(AdamW, self).__init__(*args, **kwargs) - self.wd = weight_decay - self.pat = re.compile(var_name_to_exclude) - - def apply_optimize(self, loss, startup_program, params_grads): - """Update params with weight decay.""" - super(AdamW, self).apply_optimize(loss, startup_program, params_grads) - for p, g in params_grads: - if not self.pat.match(p.name): - layers.assign(p * (1. - self.wd * self._learning_rate), p) diff --git a/modules/text/text_generation/plato2_en_large/models/plato.py b/modules/text/text_generation/plato2_en_large/models/plato.py deleted file mode 100644 index 987a42d7..00000000 --- a/modules/text/text_generation/plato2_en_large/models/plato.py +++ /dev/null @@ -1,241 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Plato model.""" - -import numpy as np -import paddle.fluid as fluid -import paddle.fluid.layers as layers - -from . 
import register_model -from .model_base import Model -from .unified_transformer import UnifiedTransformer -from .transformer_block import encoder, pre_process_layer -from plato2_en_large.utils import repeat_array_or_tensor -from plato2_en_large.utils.args import str2bool -from .generator import Generator - - -@register_model("Plato") -class Plato(UnifiedTransformer): - """Plato model.""" - - @classmethod - def add_cmdline_args(cls, parser): - """Add cmdline argurments.""" - group = UnifiedTransformer.add_cmdline_args(parser) - group.add_argument("--use_bow", type=str2bool, default=True) - group.add_argument("--use_entropy", type=str2bool, default=False) - return group - - def __init__(self, args, place): - # latent related - self.mask_id = args.mask_id - self.latent_type_size = args.latent_type_size - self.latent_emb_name = "latent_embedding" - self.use_bow = args.use_bow - self.use_entropy = args.use_entropy - - super(Plato, self).__init__(args, place) - - def _get_feed_dict(self, is_infer=False): - feed_dict = {} - feed_dict["token_ids"] = layers.data(name="token_ids", shape=[-1, self.max_seq_len, 1], dtype="int64") - feed_dict["type_ids"] = layers.data(name="type_ids", shape=[-1, self.max_seq_len, 1], dtype="int64") - feed_dict["pos_ids"] = layers.data(name="pos_ids", shape=[-1, self.max_seq_len, 1], dtype="int64") - - if not is_infer: - feed_dict["recognition_mask"] = layers.data( - name="recognition_mask", shape=[-1, self.max_seq_len + 1, self.max_seq_len + 1], dtype=self.dtype) - feed_dict["generation_mask"] = layers.data( - name="generation_mask", shape=[-1, self.max_seq_len + 1, self.max_seq_len + 1], dtype=self.dtype) - - if is_infer: - feed_dict["tgt_ids"] = layers.data( - name="tgt_ids", shape=[-1, self.max_seq_len, 1], dtype="int64", lod_level=2) - feed_dict["tgt_pos"] = layers.data( - name="tgt_pos", shape=[-1, self.max_seq_len, 1], dtype="int64", lod_level=2) - feed_dict["init_score"] = layers.data(name="init_score", shape=[-1, 1], dtype="float32", lod_level=1) - feed_dict["parent_idx"] = layers.data(name="parent_idx", shape=[-1], dtype="int64") - - feed_dict["tgt_generation_mask"] = layers.data( - name="tgt_generation_mask", shape=[-1, 1, self.max_seq_len + 1], dtype="float32") - feed_dict["latent_id"] = layers.data(name="latent_id", shape=[-1, 1], dtype="int64") - else: - feed_dict["tgt_label"] = layers.data(name="tgt_label", shape=[-1, 1], dtype="int64") - feed_dict["tgt_pos"] = layers.data(name="tgt_pos", shape=[-1, 1], dtype="int64") - - if self.use_bow: - feed_dict["bow_label"] = layers.data(name="bow_label", shape=[-1, 1], dtype="int64") - feed_dict["bow_pos"] = layers.data(name="bow_pos", shape=[-1, 1], dtype="int64") - - feed_dict["data_id"] = layers.data(name="data_id", shape=[-1, 1], dtype="int64") - return feed_dict - - def _recognition_network(self, token_ids, type_ids, pos_ids, recognition_mask): - mask_id = layers.fill_constant_batch_size_like( - input=token_ids, shape=[-1, 1, 1], value=self.mask_id, dtype="int64") - mask_emb = layers.embedding( - input=mask_id, - size=[self.vocab_size, self.emb_size], - dtype=self.dtype, - param_attr=fluid.ParamAttr(name=self.token_emb_name, initializer=self.param_initializer)) - emb_out, n_head_self_attn_mask = self._gen_input( - token_ids, type_ids, pos_ids, recognition_mask, aux_emb=mask_emb) - - recognition_out, checkpoints = self._encode(emb_out, n_head_self_attn_mask) - - recognition_feat = layers.slice(input=recognition_out, axes=[1], starts=[0], ends=[1]) - recognition_feat = layers.fc( - input=recognition_feat, - 
size=self.hidden_size, - act="tanh", - param_attr=fluid.ParamAttr(name="recognition_fc.w_0", initializer=self.param_initializer), - bias_attr="recognition_fc.b_0") - logits = layers.fc( - input=recognition_feat, - size=self.latent_type_size, - param_attr=fluid.ParamAttr(name=self.latent_emb_name, initializer=self.param_initializer), - bias_attr="recognition_bias") - return logits, checkpoints - - def _gumbel_softmax(self, logits, tau=0.67, eps=1e-10): - u = layers.uniform_random_batch_size_like(logits, shape=[-1, self.latent_type_size], min=0.0, max=1.0) - u.stop_gradient = True - gumbel = 0.0 - layers.log(eps - layers.log(u + eps)) - y = logits + gumbel - return layers.softmax(y / tau) - - def forward(self, inputs, is_infer=False): - """ - Run model main forward. - """ - outputs = {} - if is_infer: - self.generation_caches = [{ - "k": - layers.fill_constant_batch_size_like( - input=inputs["token_ids"], shape=[-1, 0, self.d_key * self.n_head], dtype=self.dtype, value=0), - "v": - layers.fill_constant_batch_size_like( - input=inputs["token_ids"], shape=[-1, 0, self.d_value * self.n_head], dtype=self.dtype, value=0), - } for i in range(self.n_layer)] - else: - self.generation_caches = None - - latent_embeddings = layers.create_parameter( - shape=[self.emb_size, self.latent_type_size], - dtype=self.dtype, - attr=fluid.ParamAttr(name=self.latent_emb_name, initializer=self.param_initializer)) - - if is_infer: - latent_id = inputs["latent_id"] - weights = layers.one_hot(latent_id, self.latent_type_size) - else: - logits, recognition_checkpoints = self._recognition_network( - token_ids=inputs["token_ids"], - type_ids=inputs["type_ids"], - pos_ids=inputs["pos_ids"], - recognition_mask=inputs["recognition_mask"], - ) - outputs["post_probs"] = layers.softmax(logits) - weights = self._gumbel_softmax(logits) - outputs["checkpoints"] = recognition_checkpoints - - latent_emb = layers.matmul(x=weights, y=latent_embeddings, transpose_y=True) - outputs["enc_out"], generation_checkpoints = self._generation_network( - token_ids=inputs["token_ids"], - type_ids=inputs["type_ids"], - pos_ids=inputs["pos_ids"], - generation_mask=inputs["generation_mask"], - aux_emb=layers.unsqueeze(latent_emb, axes=[1]), - gather_idx=inputs.get("parent_idx", None), - ) - - if not is_infer: - outputs["checkpoints"].extend(generation_checkpoints) - return outputs - - def _calc_bow_logits(self, enc_out, checkpoints, bow_pos): - """Get the logits of generation.""" - bow_feat = layers.slice(input=enc_out, axes=[1], starts=[0], ends=[1]) - bow_feat = layers.reshape(x=bow_feat, shape=[-1, self.hidden_size]) - bow_pos = layers.cast(x=bow_pos, dtype="int32") - bow_feat = layers.gather(input=bow_feat, index=bow_pos) - - bow_trans_feat = layers.fc( - input=bow_feat, - size=self.emb_size, - act=self.hidden_act, - param_attr=fluid.ParamAttr(name="bow_trans_fc.w_0", initializer=self.param_initializer), - bias_attr=fluid.ParamAttr(name="bow_trans_fc.b_0")) - - bow_trans_feat = pre_process_layer(bow_trans_feat, self.post_cls_cmd, name="bow_trans") - - checkpoints.append(bow_trans_feat) - - if self.weight_sharing: - fc_out = layers.matmul( - x=bow_trans_feat, - y=fluid.default_main_program().global_block().var(self.token_emb_name), - transpose_y=True) - if self.cls_bias: - fc_out += layers.create_parameter( - shape=[self.vocab_size], - dtype=self.dtype, - attr=fluid.ParamAttr(name="bow_out_fc.b_0"), - is_bias=True) - else: - bow_out_bias_attr = fluid.ParamAttr(name="bow_out_fc.b_0") if self.cls_bias else False - fc_out = layers.fc( - 
input=bow_trans_feat, - size=self.vocab_size, - param_attr=fluid.ParamAttr(name="bow_out_fc.w_0", initializer=self.param_initializer), - bias_attr=bow_out_bias_attr) - return fc_out - - def _get_metrics(self, inputs, outputs): - metrics = super(Plato, self)._get_metrics(inputs, outputs) - - if self.use_bow: - fc_out = self._calc_bow_logits(outputs["enc_out"], outputs["checkpoints"], inputs["bow_pos"]) - bow_loss = layers.softmax_with_cross_entropy(logits=fc_out, label=inputs["bow_label"]) - mean_bow_loss = layers.mean(bow_loss) - metrics["token_bow_loss"] = mean_bow_loss - metrics["loss"] = metrics["loss"] + mean_bow_loss - - entropy_loss = layers.reduce_sum(outputs["post_probs"] * layers.log(outputs["post_probs"]), dim=1) - mean_entropy_loss = layers.mean(entropy_loss) - metrics["entropy_loss"] = mean_entropy_loss - if self.use_entropy: - metrics["loss"] = metrics["loss"] + mean_entropy_loss - return metrics - - def infer_step(self, inputs): - """ - Run one inference step. - """ - if self.do_generation: - batch_size = len(inputs["data_id"]) - new_bsz = batch_size * self.latent_type_size - inputs = { - name: repeat_array_or_tensor(array_or_tensor, self.place, self.latent_type_size) - for name, array_or_tensor in inputs.items() - } - # Add latent_id - inputs["latent_id"] = np.array([i for i in range(self.latent_type_size) for _ in range(batch_size)], - dtype="int64").reshape([-1, 1]) - - return super(Plato, self).infer_step(inputs) - else: - return self._execute(self.infer_program, self._get_feed(inputs, is_infer=True), self.infer_fetch_dict) diff --git a/modules/text/text_generation/plato2_en_large/models/transformer_block.py b/modules/text/text_generation/plato2_en_large/models/transformer_block.py deleted file mode 100644 index 041306a5..00000000 --- a/modules/text/text_generation/plato2_en_large/models/transformer_block.py +++ /dev/null @@ -1,332 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Transformer block.""" - -from functools import partial - -import paddle.fluid as fluid -import paddle.fluid.layers as layers - - -def multi_head_attention(queries, - keys, - values, - attn_bias, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0., - cache=None, - gather_idx=None, - store=False, - param_initializer=None, - name="multi_head_att"): - """ - Multi-Head Attention. Note that attn_bias is added to the logit before - computing softmax activiation to mask certain selected positions so that - they will not considered in attention weights. - """ - keys = queries if keys is None else keys - values = keys if values is None else values - - if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): - raise ValueError("Inputs: quries, keys and values should all be 3-D tensors.") - - def __compute_qkv(queries, keys, values, n_head, d_key, d_value): - """ - Add linear projection to queries, keys, and values. 
- """ - q = layers.fc( - input=queries, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + "_query_fc.w_0", initializer=param_initializer), - bias_attr=name + "_query_fc.b_0") - k = layers.fc( - input=keys, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + "_key_fc.w_0", initializer=param_initializer), - bias_attr=name + "_key_fc.b_0") - v = layers.fc( - input=values, - size=d_value * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + "_value_fc.w_0", initializer=param_initializer), - bias_attr=name + "_value_fc.b_0") - return q, k, v - - def __split_heads(x, n_head): - """ - Reshape the last dimension of inpunt tensor x so that it becomes two - dimensions and then transpose. Specifically, input a tensor with shape - [bs, max_sequence_length, n_head * hidden_dim] then output a tensor - with shape [bs, n_head, max_sequence_length, hidden_dim]. - """ - hidden_size = x.shape[-1] - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) - - # permuate the dimensions into: - # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) - - def __combine_heads(x): - """ - Transpose and then reshape the last two dimensions of inpunt tensor x - so that it becomes one dimension, which is reverse to __split_heads. - """ - if len(x.shape) == 3: return x - if len(x.shape) != 4: - raise ValueError("Input(x) should be a 4-D Tensor.") - - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - return layers.reshape(x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True) - - def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): - """ - Scaled Dot-Product Attention - """ - scaled_q = layers.scale(x=q, scale=d_key**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - if attn_bias: - product += attn_bias - weights = layers.softmax(product, use_cudnn=True) - if dropout_rate: - weights = layers.dropout( - weights, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False) - out = layers.matmul(weights, v) - return out - - q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - - if cache is not None: # use cache and concat time steps - # Since the inplace reshape in __split_heads changes the shape of k and - # v, which is the cache input for next time step, reshape the cache - # input from the previous time step first. 
- cache_k, cache_v = cache["k"], cache["v"] - select_k = layers.gather(cache_k, index=gather_idx) - select_v = layers.gather(cache_v, index=gather_idx) - select_k = layers.reshape(select_k, shape=[0, 0, d_key * n_head]) - select_v = layers.reshape(select_v, shape=[0, 0, d_value * n_head]) - if store: - k = layers.concat([select_k, k], axis=1) - v = layers.concat([select_v, v], axis=1) - layers.assign(k, cache["k"]) - layers.assign(v, cache["v"]) - else: - #k = select_k - #v = select_v - tmp_k = layers.concat([select_k, k[:, :1]], axis=1) - tmp_v = layers.concat([select_v, v[:, :1]], axis=1) - layers.assign(tmp_k, cache["k"]) - layers.assign(tmp_v, cache["v"]) - k = layers.concat([select_k, k], axis=1) - v = layers.concat([select_v, v], axis=1) - - q = __split_heads(q, n_head) - k = __split_heads(k, n_head) - v = __split_heads(v, n_head) - - ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate) - - out = __combine_heads(ctx_multiheads) - - # Project back to the model size. - proj_out = layers.fc( - input=out, - size=d_model, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + "_output_fc.w_0", initializer=param_initializer), - bias_attr=name + "_output_fc.b_0") - return proj_out - - -def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate, hidden_act, param_initializer=None, name="ffn"): - """ - Position-wise Feed-Forward Networks. - This module consists of two linear transformations with a ReLU activation - in between, which is applied to each position separately and identically. - """ - hidden = layers.fc( - input=x, - size=d_inner_hid, - num_flatten_dims=2, - act=hidden_act, - param_attr=fluid.ParamAttr(name=name + "_fc_0.w_0", initializer=param_initializer), - bias_attr=name + "_fc_0.b_0") - if dropout_rate: - hidden = layers.dropout( - hidden, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False) - out = layers.fc( - input=hidden, - size=d_hid, - num_flatten_dims=2, - param_attr=fluid.ParamAttr(name=name + "_fc_1.w_0", initializer=param_initializer), - bias_attr=name + "_fc_1.b_0") - return out - - -def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0., epsilon=1e-5, name=""): - """ - Add residual connection, layer normalization and droput to the out tensor - optionally according to the value of process_cmd. - This will be used before or after multi-head attention and position-wise - feed-forward networks. 
- """ - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out = layers.layer_norm( - out, - begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.ParamAttr(name=name + "_layer_norm_scale", initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr(name=name + "_layer_norm_bias", initializer=fluid.initializer.Constant(0.)), - epsilon=epsilon) - elif cmd == "d": # add dropout - if dropout_rate: - out = layers.dropout( - out, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False) - return out - - -pre_process_layer = partial(pre_post_process_layer, None) -post_process_layer = pre_post_process_layer - - -def encoder_layer(input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name="", - epsilon=1e-5, - cache=None, - gather_idx=None, - store=False): - """ - The encoder layers that can be stacked to form a deep encoder. - This module consits of a multi-head (self) attention followed by - position-wise feed-forward networks and both the two components companied - with the pre_process_layer / post_process_layer to add residual connection, - layer normalization and droput. - """ - attn_output = multi_head_attention( - pre_process_layer(input, preprocess_cmd, prepostprocess_dropout, epsilon=epsilon, name=name + "_pre_att"), - None, - None, - attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - param_initializer=param_initializer, - name=name + "_multi_head_att", - cache=cache, - gather_idx=gather_idx, - store=store) - attn_output = post_process_layer( - input, attn_output, postprocess_cmd, prepostprocess_dropout, name=name + "_post_att", epsilon=epsilon) - ffd_output = positionwise_feed_forward( - pre_process_layer(attn_output, preprocess_cmd, prepostprocess_dropout, epsilon=epsilon, name=name + "_pre_ffn"), - d_inner_hid, - d_model, - relu_dropout, - hidden_act, - param_initializer=param_initializer, - name=name + "_ffn") - ffd_output = post_process_layer( - attn_output, ffd_output, postprocess_cmd, prepostprocess_dropout, name=name + "_post_ffn", epsilon=epsilon) - return ffd_output, [attn_output, ffd_output] - - -def encoder(enc_input, - attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name="", - epsilon=1e-5, - n_layer_per_block=1, - param_share="normal", - caches=None, - gather_idx=None, - store=False): - """ - The encoder is composed of a stack of identical layers returned by calling - encoder_layer. 
- """ - checkpoints = [] - names = [] - if param_share == "inner_share": - for _ in range(n_layer // n_layer_per_block): - for i in range(n_layer_per_block): - names.append(name + "_layer_" + str(i)) - else: - for i in range(n_layer // n_layer_per_block): - for _ in range(n_layer_per_block): - names.append(name + "_layer_" + str(i)) - - for i in range(n_layer): - enc_output, cps = encoder_layer( - enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - param_initializer=param_initializer, - epsilon=epsilon, - name=names[i], - cache=caches[i] if caches is not None else None, - gather_idx=gather_idx, - store=store) - checkpoints.extend(cps) - enc_input = enc_output - enc_output = pre_process_layer( - enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder", epsilon=epsilon) - - return enc_output, checkpoints diff --git a/modules/text/text_generation/plato2_en_large/models/unified_transformer.py b/modules/text/text_generation/plato2_en_large/models/unified_transformer.py deleted file mode 100644 index c4ebf115..00000000 --- a/modules/text/text_generation/plato2_en_large/models/unified_transformer.py +++ /dev/null @@ -1,378 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Unified Transformer model.""" - -import numpy as np -import paddle.fluid as fluid -import paddle.fluid.layers as layers - -from . 
import register_model -from .model_base import Model -from .transformer_block import encoder, pre_process_layer -from plato2_en_large.utils.args import str2bool -from plato2_en_large.utils import repeat_array_or_tensor, slice_array_or_tensor -from .generator import Generator - - -@register_model("UnifiedTransformer") -class UnifiedTransformer(Model): - """Unified Transformer""" - - @classmethod - def add_cmdline_args(cls, parser): - """Add cmdline argurments.""" - group = Model.add_cmdline_args(parser) - group.add_argument("--max_seq_len", type=int, default=256) - group.add_argument("--weight_sharing", type=str2bool, default=True) - group.add_argument("--mem_efficient", type=str2bool, default=False) - - Generator.add_cmdline_args(parser) - return group - - def __init__(self, args, place): - self.max_seq_len = args.max_seq_len - - self.emb_size = args.emb_size or args.hidden_size - self.hidden_size = args.hidden_size - - self.n_layer = args.num_hidden_layers - self.n_head = args.num_attention_heads - self.d_key = args.get("key_size", self.hidden_size // self.n_head) - self.d_value = args.get("value_size", self.hidden_size // self.n_head) - self.inner_hidden_size = args.get("inner_hidden_size", self.hidden_size * 4) - - self.vocab_size = args.vocab_size - self.max_position_seq_len = args.max_position_embeddings - self.type_size = args.type_vocab_size - self.token_emb_name = "word_embedding" - self.type_emb_name = "sent_embedding" - self.pos_emb_name = "pos_embedding" - - self.epsilon = args.epsilon or 1e-5 - self.n_layer_per_block = args.n_layer_per_block or 1 - self.pre_encoder_cmd = args.get("pre_encoder_cmd", "nd") - self.preprocess_cmd = args.get("preprocess_cmd", "") - self.postprocess_cmd = args.get("postprocess_cmd", "dan") - self.post_cls_cmd = args.get("post_cls_cmd", "n") - self.cls_bias = args.get("cls_bias", True) - if self.hidden_size != self.emb_size: - self.emb_mapping_in = True - else: - self.emb_mapping_in = args.get("emb_mapping_in", False) - - self.hidden_act = args.hidden_act - self.prepostprocess_dropout = args.hidden_dropout_prob - self.attention_dropout = args.attention_probs_dropout_prob - self.weight_sharing = args.weight_sharing - - self.mem_efficient = args.mem_efficient - - self.dtype = "float32" - - # Initialize all weigths by truncated normal initializer, and all biases - # will be initialized by constant zero by default. 
- self.param_initializer = fluid.initializer.TruncatedNormal(scale=args.initializer_range) - - # task-related - self.generator = Generator(args) - self.do_generation = args.do_generation - - super(UnifiedTransformer, self).__init__(args, place) - - def _gen_input(self, token_ids, type_ids, pos_ids, input_mask, aux_emb=None): - token_emb_out = layers.embedding( - input=token_ids, - size=[self.vocab_size, self.emb_size], - dtype=self.dtype, - param_attr=fluid.ParamAttr(name=self.token_emb_name, initializer=self.param_initializer)) - type_emb_out = layers.embedding( - input=type_ids, - size=[self.type_size, self.emb_size], - dtype=self.dtype, - param_attr=fluid.ParamAttr(name=self.type_emb_name, initializer=self.param_initializer)) - pos_emb_out = layers.embedding( - input=pos_ids, - size=[self.max_position_seq_len, self.emb_size], - dtype=self.dtype, - param_attr=fluid.ParamAttr(name=self.pos_emb_name, initializer=self.param_initializer)) - emb_out = token_emb_out + type_emb_out + pos_emb_out - - # auxiliary memory embeddings - if aux_emb is not None: - emb_out = layers.concat([aux_emb, emb_out], axis=1) - - # post process of embedding - emb_out = pre_process_layer( - emb_out, self.pre_encoder_cmd, self.prepostprocess_dropout, name="pre_encoder", epsilon=self.epsilon) - if self.emb_mapping_in: - emb_out = layers.fc( - input=emb_out, - num_flatten_dims=2, - size=self.hidden_size, - param_attr=fluid.ParamAttr(name="emb_hidden_mapping", initializer=self.param_initializer), - bias_attr="emb_hidden_mapping_bias") - - # generate n-head self-attention mask - self_attn_mask = input_mask - self_attn_mask = layers.scale(x=self_attn_mask, scale=1e4, bias=-1.0, bias_after_scale=False) - n_head_self_attn_mask = layers.stack(x=[self_attn_mask] * self.n_head, axis=1) - n_head_self_attn_mask.stop_gradient = True - - return emb_out, n_head_self_attn_mask - - def _get_pooled_output(self, enc_out, pos): - enc_out = layers.reshape(x=enc_out, shape=[-1, self.hidden_size]) - pos = layers.cast(x=pos, dtype="int32") - feat = layers.gather(input=enc_out, index=pos) - - pooled_out = layers.fc( - input=feat, - size=self.hidden_size, - act="tanh", - param_attr=fluid.ParamAttr(name="pooled_fc.w_0", initializer=self.param_initializer), - bias_attr="pooled_fc.b_0") - return pooled_out - - def _generation_network(self, token_ids, type_ids, pos_ids, generation_mask, aux_emb=None, gather_idx=None): - emb_out, n_head_self_attn_mask = self._gen_input(token_ids, type_ids, pos_ids, generation_mask, aux_emb=aux_emb) - return self._encode(emb_out, n_head_self_attn_mask, self.generation_caches, gather_idx=gather_idx) - - def _encode(self, emb_out, n_head_self_attn_mask, caches=None, gather_idx=None): - return encoder( - enc_input=emb_out, - attn_bias=n_head_self_attn_mask, - n_layer=self.n_layer, - n_head=self.n_head, - d_key=self.d_key, - d_value=self.d_value, - d_model=self.hidden_size, - d_inner_hid=self.inner_hidden_size, - prepostprocess_dropout=self.prepostprocess_dropout, - attention_dropout=self.attention_dropout, - relu_dropout=0, - hidden_act=self.hidden_act, - preprocess_cmd=self.preprocess_cmd, - postprocess_cmd=self.postprocess_cmd, - param_initializer=self.param_initializer, - epsilon=self.epsilon, - n_layer_per_block=self.n_layer_per_block, - name="encoder", - caches=caches, - gather_idx=gather_idx, - store=caches is not None) - - def _gumbel_softmax(self, logits, tau=0.67, eps=1e-10): - u = layers.uniform_random_batch_size_like(logits, shape=[-1, self.latent_type_size], min=0.0, max=1.0) - u.stop_gradient = True - 
gumbel = 0.0 - layers.log(eps - layers.log(u + eps)) - y = logits + gumbel - return layers.softmax(y / tau) - - def _get_feed_dict(self, is_infer=False): - """ - Get the feed list of the model. - - Args: - is_infer(bool): True if running inference. - - Returns: - list(Variable): The feed list. - list(str): The name of each Variable in feed list. - """ - feed_dict = {} - feed_dict["token_ids"] = layers.data(name="token_ids", shape=[-1, self.max_seq_len, 1], dtype="int64") - feed_dict["type_ids"] = layers.data(name="type_ids", shape=[-1, self.max_seq_len, 1], dtype="int64") - feed_dict["pos_ids"] = layers.data(name="pos_ids", shape=[-1, self.max_seq_len, 1], dtype="int64") - - feed_dict["generation_mask"] = layers.data( - name="generation_mask", shape=[-1, self.max_seq_len, self.max_seq_len], dtype=self.dtype) - - if is_infer: - feed_dict["tgt_ids"] = layers.data( - name="tgt_ids", shape=[-1, self.max_seq_len, 1], dtype="int64", lod_level=2) - feed_dict["tgt_pos"] = layers.data( - name="tgt_pos", shape=[-1, self.max_seq_len, 1], dtype="int64", lod_level=2) - feed_dict["init_score"] = layers.data(name="init_score", shape=[-1, 1], dtype="float32", lod_level=1) - feed_dict["parent_idx"] = layers.data(name="parent_idx", shape=[-1], dtype="int64") - - feed_dict["tgt_generation_mask"] = layers.data( - name="tgt_generation_mask", shape=[-1, 1, self.max_seq_len], dtype="float32") - else: - feed_dict["tgt_label"] = layers.data(name="tgt_label", shape=[-1, 1], dtype="int64") - feed_dict["tgt_pos"] = layers.data(name="tgt_pos", shape=[-1, 1], dtype="int64") - - feed_dict["data_id"] = layers.data(name="data_id", shape=[-1, 1], dtype="int64") - return feed_dict - - def forward(self, inputs, is_infer=False): - """ - Run model main forward. - """ - outputs = {} - if is_infer: - self.generation_caches = [{ - "k": - layers.fill_constant_batch_size_like( - input=inputs["token_ids"], shape=[-1, 0, self.d_key * self.n_head], dtype=self.dtype, value=0), - "v": - layers.fill_constant_batch_size_like( - input=inputs["token_ids"], shape=[-1, 0, self.d_value * self.n_head], dtype=self.dtype, value=0), - } for i in range(self.n_layer)] - else: - self.generation_caches = None - - outputs["enc_out"], generation_checkpoints = self._generation_network( - token_ids=inputs["token_ids"], - type_ids=inputs["type_ids"], - pos_ids=inputs["pos_ids"], - generation_mask=inputs["generation_mask"], - gather_idx=inputs.get("parent_idx", None)) - - if not is_infer: - outputs["checkpoints"] = generation_checkpoints - return outputs - - def _calc_logits(self, enc_out, checkpoints=None, seq_pos=None): - """Get the logits of generation.""" - enc_out = layers.reshape(x=enc_out, shape=[-1, self.hidden_size]) - if seq_pos is not None: - seq_pos = layers.cast(x=seq_pos, dtype="int32") - seq_feat = layers.gather(input=enc_out, index=seq_pos) - else: - seq_feat = enc_out - - seq_trans_feat = layers.fc( - input=seq_feat, - size=self.emb_size, - act=self.hidden_act, - param_attr=fluid.ParamAttr(name="mask_lm_trans_fc.w_0", initializer=self.param_initializer), - bias_attr=fluid.ParamAttr(name="mask_lm_trans_fc.b_0")) - - seq_trans_feat = pre_process_layer(seq_trans_feat, self.post_cls_cmd, name="mask_lm_trans") - - if checkpoints is not None: - checkpoints.append(seq_trans_feat) - - if self.weight_sharing: - fc_out = layers.matmul( - x=seq_trans_feat, - y=fluid.default_main_program().global_block().var(self.token_emb_name), - transpose_y=True) - if self.cls_bias: - fc_out += layers.create_parameter( - shape=[self.vocab_size], - dtype=self.dtype, 
- attr=fluid.ParamAttr(name="mask_lm_out_fc.b_0"), - is_bias=True) - else: - seq_out_bias_attr = fluid.ParamAttr(name="mask_lm_out_fc.b_0") if self.cls_bias else False - fc_out = layers.fc( - input=seq_trans_feat, - size=self.vocab_size, - param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0", initializer=self.param_initializer), - bias_attr=seq_out_bias_attr) - return fc_out - - def _get_metrics(self, inputs, outputs): - metrics = {} - - fc_out = self._calc_logits(outputs["enc_out"], outputs["checkpoints"], inputs["tgt_pos"]) - tgt_lm_loss = layers.softmax_with_cross_entropy(logits=fc_out, label=inputs["tgt_label"]) - mean_tgt_lm_loss = layers.mean(tgt_lm_loss) - loss = mean_tgt_lm_loss - metrics["token_lm_loss"] = mean_tgt_lm_loss - - metrics["loss"] = loss - return metrics - - def _get_statistics(self, inputs, outputs): - statistics = {} - if "tgt_label" in inputs: - statistics["tokens_num"] = layers.reduce_sum( - layers.fill_constant_batch_size_like(input=inputs["tgt_label"], value=1.0, shape=[-1], dtype="int64")) - statistics["batch_size"] = layers.reduce_sum( - layers.fill_constant_batch_size_like(input=inputs["token_ids"], value=1.0, shape=[-1], dtype="int64")) - return statistics - - def get_metrics_and_statistics(self, inputs, outputs): - """ - Get metrics and statistics. - """ - metrics = self._get_metrics(inputs, outputs) - statistics = self._get_statistics(inputs, outputs) - return metrics, statistics - - def infer(self, inputs, outputs): - """ - Run model inference. - """ - if self.do_generation: - return self.generator.inference(self, inputs, outputs) - else: - raise NotImplementedError - - def _run_generation(self, inputs): - """ - Run generation. - """ - batch_size = len(inputs["data_id"]) - inputs["parent_idx"] = np.array(range(batch_size), dtype="int64") - outputs = self._execute( - self.infer_program, self._get_feed(inputs, is_infer=True), self.infer_fetch_dict, return_numpy=False) - - predictions = [] - data_id_list = np.array(outputs["data_id"]).reshape(-1).tolist() - token_ids_list = np.array(outputs["token_ids"]).squeeze(2).tolist() - seq_ids = outputs["finished_ids"] - seq_ids_np = np.array(outputs["finished_ids"]) - seq_scores_np = np.array(outputs["finished_scores"]) - for i, (data_id, token_ids) in enumerate(zip(data_id_list, token_ids_list)): - start = seq_ids.lod()[0][i] - end = seq_ids.lod()[0][i + 1] - for j in range(start, end): - sub_start = seq_ids.lod()[1][j] - sub_end = seq_ids.lod()[1][j + 1] - info = {} - info["data_id"] = data_id - info["decode_score"] = float(seq_scores_np[sub_end - 1]) - info["context_token_ids"] = token_ids - info["response_token_ids"] = seq_ids_np[sub_start:sub_end].tolist() - predictions.append(info) - return predictions - - def infer_step(self, inputs): - """ - Run one inference step. 
- """ - if self.do_generation: - if self.generator.num_samples: - inputs = { - name: repeat_array_or_tensor(array_or_tensor, self.place, self.generator.num_samples) - for name, array_or_tensor in inputs.items() - } - - if self.mem_efficient: - predictions = [] - for idx in range(0, len(inputs["data_id"]), self.batch_size): - part_inputs = { - name: slice_array_or_tensor(array_or_tensor, self.place, idx, idx + self.batch_size) - for name, array_or_tensor in inputs.items() - } - part_outputs = self._run_generation(part_inputs) - predictions.extend(part_outputs) - else: - predictions = self._run_generation(inputs) - return predictions - else: - return self._execute(self.infer_program, self._get_feed(inputs, is_infer=True), self.infer_fetch_dict) diff --git a/modules/text/text_generation/plato2_en_large/module.py b/modules/text/text_generation/plato2_en_large/module.py index 61e684db..ffc3fa27 100644 --- a/modules/text/text_generation/plato2_en_large/module.py +++ b/modules/text/text_generation/plato2_en_large/module.py @@ -12,83 +12,102 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import ast -import os -import json -import sys import argparse import contextlib +import os +import sys from collections import namedtuple -import paddle.fluid as fluid +import paddle +import paddle.nn as nn + import paddlehub as hub +from .model import Plato2InferModel +from .readers.nsp_reader import NSPReader +from .readers.plato_reader import PlatoReader +from .utils import gen_inputs +from .utils.args import parse_args +from .utils.args import str2bool +from paddlehub.module.module import moduleinfo from paddlehub.module.module import runnable +from paddlehub.module.module import serving from paddlehub.module.nlp_module import DataFormatError -from paddlehub.common.logger import logger -from paddlehub.module.module import moduleinfo, serving - -import plato2_en_large.models as plato_models -from plato2_en_large.tasks.dialog_generation import DialogGeneration -from plato2_en_large.utils import check_cuda, Timer -from plato2_en_large.utils.args import parse_args @moduleinfo( name="plato2_en_large", - version="1.0.0", + version="1.1.0", summary= "A novel pre-training model for dialogue generation, incorporated with latent discrete variables for one-to-many relationship modeling.", author="baidu-nlp", author_email="", type="nlp/text_generation", ) -class Plato(hub.NLPPredictionModule): - def _initialize(self): +class Plato2(nn.Layer, hub.NLPPredictionModule): + + def __init__(self): """ initialize with the necessary elements """ - if "CUDA_VISIBLE_DEVICES" not in os.environ: - raise RuntimeError("The module only support GPU. Please set the environment variable CUDA_VISIBLE_DEVICES.") - + super(Plato2, self).__init__() args = self.setup_args() - self.task = DialogGeneration(args) - self.model = plato_models.create_model(args, fluid.CUDAPlace(0)) + + if args.num_layers == 24: + n_head = 16 + hidden_size = 1024 + elif args.num_layers == 32: + n_head = 32 + hidden_size = 2048 + else: + raise ValueError('The pre-trained model only support 24 or 32 layers, ' + 'but received num_layers=%d.' 
% args.num_layers) + + self.plato_reader = PlatoReader(args) + nsp_reader = NSPReader(args) + self.model = Plato2InferModel(nsp_reader, args.num_layers, n_head, hidden_size) + state_dict = paddle.load(args.init_from_ckpt) + self.model.set_state_dict(state_dict) + self.model.eval() self.Example = namedtuple("Example", ["src", "data_id"]) + self.latent_type_size = args.latent_type_size self._interactive_mode = False def setup_args(self): """ Setup arguments. """ - assets_path = os.path.join(self.directory, "assets") - vocab_path = os.path.join(assets_path, "vocab.txt") - init_pretraining_params = os.path.join(assets_path, "32L", "Plato") - spm_model_file = os.path.join(assets_path, "spm.model") - nsp_inference_model_path = os.path.join(assets_path, "32L", "NSP") - config_path = os.path.join(assets_path, "32L.json") + ckpt_path = os.path.join(self.directory, 'assets', '32L.pdparams') + vocab_path = os.path.join(self.directory, 'assets', 'vocab.txt') + spm_model_file = os.path.join(self.directory, 'assets', 'spm.model') # ArgumentParser.parse_args use argv[1:], it will drop the first one arg, so the first one in sys.argv should be "" sys.argv = [ - "", "--model", "Plato", "--vocab_path", - "%s" % vocab_path, "--do_lower_case", "False", "--init_pretraining_params", - "%s" % init_pretraining_params, "--spm_model_file", - "%s" % spm_model_file, "--nsp_inference_model_path", - "%s" % nsp_inference_model_path, "--ranking_score", "nsp_score", "--do_generation", "True", "--batch_size", - "1", "--config_path", - "%s" % config_path + "--empty", + "--spm_model_file", + "%s" % spm_model_file, + "--vocab_path", + "%s" % vocab_path, ] parser = argparse.ArgumentParser() - plato_models.add_cmdline_args(parser) - DialogGeneration.add_cmdline_args(parser) - args = parse_args(parser) + group = parser.add_argument_group("Model") + group.add_argument("--init_from_ckpt", type=str, default=ckpt_path) + group.add_argument("--vocab_size", type=int, default=8001) + group.add_argument("--latent_type_size", type=int, default=20) + group.add_argument("--num_layers", type=int, default=32) - args.load(args.config_path, "Model") - args.run_infer = True # only build infer program + group = parser.add_argument_group("Task") + group.add_argument("--is_cn", type=str2bool, default=False) + + NSPReader.add_cmdline_args(parser) + + args = parse_args(parser) + args.batch_size *= args.latent_type_size return args @serving + @paddle.no_grad() def generate(self, texts): """ Get the robot responses of the input texts. @@ -113,10 +132,12 @@ class Plato(hub.NLPPredictionModule): bot_responses = [] for i, text in enumerate(texts): - example = self.Example(src=text.replace("\t", " [SEP] "), data_id=i) - record = self.task.reader._convert_example_to_record(example, is_infer=True) - data = self.task.reader._pad_batch_records([record], is_infer=True) - pred = self.task.infer_step(self.model, data)[0] # batch_size is 1 + example = self.Example(src=text.replace("\t", " [SEP] "), data_id=0) + record = self.plato_reader._convert_example_to_record(example, is_infer=True) + data = self.plato_reader._pad_batch_records([record], is_infer=True) + inputs = gen_inputs(data, self.latent_type_size) + inputs['tgt_ids'] = inputs['tgt_ids'].astype('int64') + pred = self.model(inputs)[0] # batch_size is 1 bot_response = pred["response"] # ignore data_id and score bot_responses.append(bot_response) @@ -144,11 +165,10 @@ class Plato(hub.NLPPredictionModule): """ Run as a command """ - self.parser = argparse.ArgumentParser( - description='Run the %s module.' 
% self.name, - prog='hub run %s' % self.name, - usage='%(prog)s', - add_help=True) + self.parser = argparse.ArgumentParser(description='Run the %s module.' % self.name, + prog='hub run %s' % self.name, + usage='%(prog)s', + add_help=True) self.arg_input_group = self.parser.add_argument_group(title="Input options", description="Input data. Required") self.arg_config_group = self.parser.add_argument_group( @@ -167,14 +187,3 @@ class Plato(hub.NLPPredictionModule): results = self.generate(texts=input_data) return results - - -if __name__ == "__main__": - module = Plato() - for result in module.generate(["Hello", "Hello\thi, nice to meet you, my name is tom\tso your name is tom?"]): - print(result) - with module.interactive_mode(max_turn=3): - while True: - human_utterance = input() - robot_utterance = module.generate(human_utterance) - print("Robot: %s" % robot_utterance[0]) diff --git a/modules/text/text_generation/plato2_en_large/readers/dialog_reader.py b/modules/text/text_generation/plato2_en_large/readers/dialog_reader.py index 73be362f..ed645e9d 100644 --- a/modules/text/text_generation/plato2_en_large/readers/dialog_reader.py +++ b/modules/text/text_generation/plato2_en_large/readers/dialog_reader.py @@ -17,16 +17,13 @@ import csv from collections import namedtuple from contextlib import contextmanager import gzip -import os import numpy as np -import paddle.fluid as fluid -from paddle.fluid.incubate.fleet.collective import fleet -from plato2_en_large.utils import pad_batch_data -from plato2_en_large.utils.args import str2bool -from plato2_en_large.utils.masking import mask -import plato2_en_large.utils.tokenization as tokenization +from ..utils import pad_batch_data +from ..utils.args import str2bool +from ..utils.masking import mask +from ..utils import tokenization class DialogReader(object): @@ -38,9 +35,17 @@ class DialogReader(object): group = parser.add_argument_group("Reader") group.add_argument("--max_src_len", type=int, default=128) group.add_argument("--max_tgt_len", type=int, default=128) - group.add_argument("--truncate_first_turn", type=str2bool, default=False) - group.add_argument("--file_format", type=str, default="file", choices=["file", "filelist"]) - group.add_argument("--data_format", type=str, default="raw", choices=["raw", "tokenized", "numerical"]) + group.add_argument("--truncate_first_turn", + type=str2bool, + default=False) + group.add_argument("--file_format", + type=str, + default="file", + choices=["file", "filelist"]) + group.add_argument("--data_format", + type=str, + default="raw", + choices=["raw", "tokenized", "numerical"]) group.add_argument("--in_tokens", type=str2bool, default=False) group.add_argument("--batch_size", type=int, default=16) group.add_argument("--continuous_position", type=str2bool, default=True) @@ -48,7 +53,9 @@ class DialogReader(object): group.add_argument("--sort_pool_size", type=int, default=2**16) group = parser.add_argument_group("Tokenizer") - group.add_argument("--tokenizer", type=str, default="SentencePieceTokenizer") + group.add_argument("--tokenizer", + type=str, + default="SentencePieceTokenizer") args, _ = parser.parse_known_args() tokenizer_cls = getattr(tokenization, args.tokenizer) tokenizer_cls.add_cmdline_args(parser) @@ -89,7 +96,9 @@ class DialogReader(object): self.fields += ["tgt_start_idx", "data_id"] self.sort_key = lambda record: [len(record.token_ids)] - self.Record = namedtuple("Record", self.fields, defaults=(None, ) * len(self.fields)) + self.Record = namedtuple("Record", + self.fields, + defaults=(None, 
) * len(self.fields)) self.features = {} return @@ -113,7 +122,9 @@ class DialogReader(object): else: s_tokens = self.tokenizer.tokenize(s) - s_token_ids = self.tokenizer.convert_tokens_to_ids(s_tokens) + [self.eos_id] + s_token_ids = self.tokenizer.convert_tokens_to_ids(s_tokens) + [ + self.eos_id + ] s_token_ids_list.append(s_token_ids) # trim src @@ -123,9 +134,12 @@ class DialogReader(object): total_token_num += len(s_token_ids_list[idx]) if total_token_num > self.max_src_len: if self.truncate_first_turn and idx == 0: - truncated_ids = s_token_ids_list[idx][:self.max_src_len - total_token_num] + truncated_ids = s_token_ids_list[idx][:self.max_src_len - + total_token_num] if len(truncated_ids) > 1: - s_token_ids_list[idx] = truncated_ids[:-1] + [self.eos_id] + s_token_ids_list[idx] = truncated_ids[:-1] + [ + self.eos_id + ] idx -= 1 break idx -= 1 @@ -180,7 +194,11 @@ class DialogReader(object): tgt_pos_ids = list(range(len(tgt_token_ids))) pos_ids = list(range(len(token_ids))) - field_values = {"token_ids": src_token_ids, "type_ids": src_type_ids, "pos_ids": src_pos_ids} + field_values = { + "token_ids": src_token_ids, + "type_ids": src_type_ids, + "pos_ids": src_pos_ids + } field_values["tgt_start_idx"] = tgt_start_idx field_values["data_id"] = example.data_id @@ -204,7 +222,8 @@ class DialogReader(object): def _read_numerical_file(self, fp, delimiter=";"): for i, line in enumerate(fp): - cols = tokenization.convert_to_unicode(line).strip().split(delimiter) + cols = tokenization.convert_to_unicode(line).strip().split( + delimiter) cols = list(map(lambda x: list(map(int, x.split(" "))), cols)) if len(cols) > self.num_numerical_fields: cols = cols[:self.num_numerical_fields] @@ -213,6 +232,7 @@ class DialogReader(object): yield record def _read_file(self, input_file, phase, is_infer): + def __wrapper__(): with open_file(input_file) as fp: if self.data_format == "numerical": @@ -237,13 +257,18 @@ class DialogReader(object): if phase == "train": self.current_file_index = file_index self.current_file = input_file - file_reader = self._read_file(input_file.strip(), phase, is_infer) + file_reader = self._read_file(input_file.strip(), phase, + is_infer) for record in file_reader(): yield record return __wrapper__ - def _batch_reader(self, reader, phase=None, is_infer=False, sort_pool_size=2**16): + def _batch_reader(self, + reader, + phase=None, + is_infer=False, + sort_pool_size=2**16): """Construct a batch reader.""" def update_max_lens(max_lens, record): @@ -251,7 +276,10 @@ class DialogReader(object): if max_lens is None: return self.sort_key(record) else: - return [max(max_len, l) for max_len, l in zip(max_lens, self.sort_key(record))] + return [ + max(max_len, l) + for max_len, l in zip(max_lens, self.sort_key(record)) + ] def get_batch(reader): """Generate batches from reader.""" @@ -265,7 +293,8 @@ class DialogReader(object): self.current_example += 1 max_lens = update_max_lens(max_lens, record) if self.in_tokens: - to_append = (len(batch) + 1) * sum(max_lens) <= self.batch_size + to_append = (len(batch) + + 1) * sum(max_lens) <= self.batch_size else: to_append = len(batch) < self.batch_size if to_append: @@ -286,7 +315,8 @@ class DialogReader(object): self.current_example += 1 max_lens = update_max_lens(max_lens, record) if self.in_tokens: - to_append = (len(batch) + 1) * sum(max_lens) <= self.batch_size + to_append = (len(batch) + + 1) * sum(max_lens) <= self.batch_size else: to_append = len(batch) < self.batch_size if to_append: @@ -320,7 +350,12 @@ class DialogReader(object): 
return __wrapper__ - def _distributed_batch_reader(self, batch_reader, num_part, part_id, is_test=False): + def _distributed_batch_reader(self, + batch_reader, + num_part, + part_id, + is_test=False): + def __wrapper__(): batches = [] for batch in batch_reader(): @@ -351,7 +386,8 @@ class DialogReader(object): nonlocal reader if reader is None: if self.file_format == "filelist": - reader = self._read_files(input_file, phase, is_infer, not phase.endswith("test")) + reader = self._read_files(input_file, phase, is_infer, + not phase.endswith("test")) else: if phase == "train": self.total_file = 1 @@ -360,11 +396,18 @@ class DialogReader(object): reader = self._read_file(input_file, phase, is_infer) batch_reader = self._batch_reader( - reader, phase, is_infer, sort_pool_size=self.sort_pool_size if not is_infer else 0) + reader, + phase, + is_infer, + sort_pool_size=self.sort_pool_size if not is_infer else 0) if phase == "train": - batch_reader = self._distributed_batch_reader(batch_reader, num_part, part_id) + batch_reader = self._distributed_batch_reader( + batch_reader, num_part, part_id) elif phase.startswith("distributed"): - batch_reader = self._distributed_batch_reader(batch_reader, num_part, part_id, is_test=True) + batch_reader = self._distributed_batch_reader(batch_reader, + num_part, + part_id, + is_test=True) for epoch_index in range(num_epochs): if phase == "train": @@ -375,20 +418,28 @@ class DialogReader(object): return __wrapper__ - def _gen_self_attn_mask(self, batch_token_ids, batch_tgt_start_idx=None, is_unidirectional=True, shift_len=0): + def _gen_self_attn_mask(self, + batch_token_ids, + batch_tgt_start_idx=None, + is_unidirectional=True, + shift_len=0): max_len = max(map(len, batch_token_ids)) - input_mask_data = np.zeros((len(batch_token_ids), max_len + shift_len, max_len + shift_len)) + input_mask_data = np.zeros( + (len(batch_token_ids), max_len + shift_len, max_len + shift_len)) if is_unidirectional: for index, mask_data in enumerate(input_mask_data): - start = 0 if batch_tgt_start_idx is None else batch_tgt_start_idx[index] + start = 0 if batch_tgt_start_idx is None else batch_tgt_start_idx[ + index] end = len(batch_token_ids[index]) mask_data[:end + shift_len, :start + shift_len] = 1.0 # Generate the lower triangular matrix using the slice of matrix b = np.tril(np.ones([end - start, end - start]), 0) - mask_data[start + shift_len:end + shift_len, start + shift_len:end + shift_len] = b + mask_data[start + shift_len:end + shift_len, + start + shift_len:end + shift_len] = b else: for index, token_ids in enumerate(batch_token_ids): - input_mask_data[index, :len(token_ids) + shift_len, :len(token_ids) + shift_len] = 1.0 + input_mask_data[index, :len(token_ids) + + shift_len, :len(token_ids) + shift_len] = 1.0 return input_mask_data.astype("float32") def _pad_batch_records(self, batch_records, is_infer): @@ -405,20 +456,24 @@ class DialogReader(object): batch["pos_ids"] = pad_batch_data(batch_pos_ids, pad_id=self.pad_id) batch_tgt_start_idx = [record.tgt_start_idx for record in batch_records] - batch["generation_mask"] = self._gen_self_attn_mask(batch_token_ids, batch_tgt_start_idx=batch_tgt_start_idx) + batch["generation_mask"] = self._gen_self_attn_mask( + batch_token_ids, batch_tgt_start_idx=batch_tgt_start_idx) if is_infer: - tgt_ids = np.array([[[self.bos_id]]] * len(batch_token_ids), dtype="int64") + tgt_ids = np.array([[[self.bos_id]]] * len(batch_token_ids), + dtype="int64") if self.continuous_position: tgt_pos = np.array(batch_tgt_start_idx, dtype="int64") else: 
tgt_pos = np.zeros_like(batch_tgt_start_idx, dtype="int64") tgt_pos = tgt_pos.reshape(-1, 1, 1) - batch["init_score"] = np.zeros_like(tgt_ids, dtype="float32").reshape(-1, 1).tolist() + batch["init_score"] = np.zeros_like( + tgt_ids, dtype="float32").reshape(-1, 1).tolist() batch["tgt_ids"] = tgt_ids.tolist() batch["tgt_pos"] = tgt_pos.tolist() - batch["tgt_generation_mask"] = batch["generation_mask"][:, 0:1, :].astype("float32") + batch["tgt_generation_mask"] = batch[ + "generation_mask"][:, 0:1, :].astype("float32") else: batch["tgt_label"], batch["tgt_pos"] = mask( batch_tokens=batch_token_ids, @@ -427,7 +482,8 @@ class DialogReader(object): is_unidirectional=True) batch_data_id = [record.data_id for record in batch_records] - batch["data_id"] = np.array(batch_data_id).astype("int64").reshape([-1, 1]) + batch["data_id"] = np.array(batch_data_id).astype("int64").reshape( + [-1, 1]) return batch diff --git a/modules/text/text_generation/plato2_en_large/readers/nsp_reader.py b/modules/text/text_generation/plato2_en_large/readers/nsp_reader.py index 8258ed30..b13ce465 100644 --- a/modules/text/text_generation/plato2_en_large/readers/nsp_reader.py +++ b/modules/text/text_generation/plato2_en_large/readers/nsp_reader.py @@ -17,10 +17,10 @@ from collections import namedtuple import numpy as np -from plato2_en_large.readers.dialog_reader import DialogReader -from plato2_en_large.utils import pad_batch_data -from plato2_en_large.utils.args import str2bool -from plato2_en_large.utils.masking import mask +from .dialog_reader import DialogReader +from ..utils import pad_batch_data +from ..utils.args import str2bool +from ..utils.masking import mask class NSPReader(DialogReader): @@ -30,27 +30,35 @@ class NSPReader(DialogReader): def add_cmdline_args(cls, parser): """Add cmdline argurments.""" group = DialogReader.add_cmdline_args(parser) - group.add_argument( - "--attention_style", type=str, default="bidirectional", choices=["bidirectional", "unidirectional"]) - group.add_argument("--mix_negative_sample", type=str2bool, default=False) + group.add_argument("--attention_style", + type=str, + default="bidirectional", + choices=["bidirectional", "unidirectional"]) + group.add_argument("--mix_negative_sample", + type=str2bool, + default=False) return group def __init__(self, args): super(NSPReader, self).__init__(args) self.fields.append("label") - self.Record = namedtuple("Record", self.fields, defaults=(None, ) * len(self.fields)) + self.Record = namedtuple("Record", + self.fields, + defaults=(None, ) * len(self.fields)) self.attention_style = args.attention_style self.mix_negative_sample = args.mix_negative_sample return def _convert_example_to_record(self, example, is_infer): - record = super(NSPReader, self)._convert_example_to_record(example, False) + record = super(NSPReader, + self)._convert_example_to_record(example, False) if "label" in example._fields: record = record._replace(label=int(example.label)) return record def _mix_negative_sample(self, reader, neg_pool_size=2**16): + def gen_from_pool(pool): num_samples = len(pool) if num_samples == 1: @@ -64,10 +72,16 @@ class NSPReader(DialogReader): idx_i = pool[i].tgt_start_idx idx_j = pool[j].tgt_start_idx field_values = {} - field_values["token_ids"] = pool[i].token_ids[:idx_i] + pool[j].token_ids[idx_j:] - field_values["type_ids"] = pool[i].type_ids[:idx_i] + pool[j].type_ids[idx_j:] - field_values["pos_ids"] = list(range(len(field_values["token_ids"]))) - neg_record = self.Record(**field_values, tgt_start_idx=idx_i, data_id=-1, label=0) + 
field_values["token_ids"] = pool[i].token_ids[:idx_i] + pool[ + j].token_ids[idx_j:] + field_values["type_ids"] = pool[i].type_ids[:idx_i] + pool[ + j].type_ids[idx_j:] + field_values["pos_ids"] = list( + range(len(field_values["token_ids"]))) + neg_record = self.Record(**field_values, + tgt_start_idx=idx_i, + data_id=-1, + label=0) pool.append(neg_record) assert len(neg_record.token_ids) <= self.max_seq_len self.global_rng.shuffle(pool) @@ -88,11 +102,18 @@ class NSPReader(DialogReader): return __wrapper__ - def _batch_reader(self, reader, phase=None, is_infer=False, sort_pool_size=2**16): + def _batch_reader(self, + reader, + phase=None, + is_infer=False, + sort_pool_size=2**16): if self.mix_negative_sample: reader = self._mix_negative_sample(reader) - return super(NSPReader, self)._batch_reader( - reader, phase=phase, is_infer=is_infer, sort_pool_size=sort_pool_size) + return super(NSPReader, + self)._batch_reader(reader, + phase=phase, + is_infer=is_infer, + sort_pool_size=sort_pool_size) def _pad_batch_records(self, batch_records, is_infer): """ @@ -106,8 +127,10 @@ class NSPReader(DialogReader): batch_label = [record.label for record in batch_records] if self.attention_style == "unidirectional": - batch["token_ids"] = pad_batch_data(batch_token_ids, pad_id=self.pad_id) - batch["type_ids"] = pad_batch_data(batch_type_ids, pad_id=self.pad_id) + batch["token_ids"] = pad_batch_data(batch_token_ids, + pad_id=self.pad_id) + batch["type_ids"] = pad_batch_data(batch_type_ids, + pad_id=self.pad_id) batch["pos_ids"] = pad_batch_data(batch_pos_ids, pad_id=self.pad_id) tgt_label, tgt_pos, label_pos = mask( batch_tokens=batch_token_ids, @@ -116,7 +139,8 @@ class NSPReader(DialogReader): sent_b_starts=batch_tgt_start_idx, labels=batch_label, is_unidirectional=True) - attention_mask = self._gen_self_attn_mask(batch_token_ids, batch_tgt_start_idx) + attention_mask = self._gen_self_attn_mask(batch_token_ids, + batch_tgt_start_idx) else: batch_mask_token_ids, tgt_label, tgt_pos, label_pos = mask( batch_tokens=batch_token_ids, @@ -129,10 +153,13 @@ class NSPReader(DialogReader): is_unidirectional=False) if not is_infer: batch_token_ids = batch_mask_token_ids - batch["token_ids"] = pad_batch_data(batch_token_ids, pad_id=self.pad_id) - batch["type_ids"] = pad_batch_data(batch_type_ids, pad_id=self.pad_id) + batch["token_ids"] = pad_batch_data(batch_token_ids, + pad_id=self.pad_id) + batch["type_ids"] = pad_batch_data(batch_type_ids, + pad_id=self.pad_id) batch["pos_ids"] = pad_batch_data(batch_pos_ids, pad_id=self.pad_id) - attention_mask = self._gen_self_attn_mask(batch_token_ids, is_unidirectional=False) + attention_mask = self._gen_self_attn_mask(batch_token_ids, + is_unidirectional=False) batch["attention_mask"] = attention_mask batch["label_pos"] = label_pos @@ -144,5 +171,6 @@ class NSPReader(DialogReader): batch["tgt_pos"] = tgt_pos batch_data_id = [record.data_id for record in batch_records] - batch["data_id"] = np.array(batch_data_id).astype("int64").reshape([-1, 1]) + batch["data_id"] = np.array(batch_data_id).astype("int64").reshape( + [-1, 1]) return batch diff --git a/modules/text/text_generation/plato2_en_large/readers/plato_reader.py b/modules/text/text_generation/plato2_en_large/readers/plato_reader.py index 8502f817..a72b1c89 100644 --- a/modules/text/text_generation/plato2_en_large/readers/plato_reader.py +++ b/modules/text/text_generation/plato2_en_large/readers/plato_reader.py @@ -15,9 +15,9 @@ import numpy as np -from plato2_en_large.readers.dialog_reader import DialogReader -from 
plato2_en_large.utils import pad_batch_data -from plato2_en_large.utils.masking import mask +from .dialog_reader import DialogReader +from ..utils import pad_batch_data +from ..utils.masking import mask class PlatoReader(DialogReader): @@ -47,9 +47,13 @@ class PlatoReader(DialogReader): batch["pos_ids"] = pad_batch_data(batch_pos_ids, pad_id=self.pad_id) batch["generation_mask"] = self._gen_self_attn_mask( - batch_token_ids, batch_tgt_start_idx=batch_tgt_start_idx, is_unidirectional=True, shift_len=1) + batch_token_ids, + batch_tgt_start_idx=batch_tgt_start_idx, + is_unidirectional=True, + shift_len=1) if not is_infer: - batch["recognition_mask"] = self._gen_self_attn_mask(batch_token_ids, is_unidirectional=False, shift_len=1) + batch["recognition_mask"] = self._gen_self_attn_mask( + batch_token_ids, is_unidirectional=False, shift_len=1) if is_infer: tgt_ids = np.array([[[self.bos_id]]] * batch_size, dtype="int64") @@ -58,20 +62,21 @@ class PlatoReader(DialogReader): else: tgt_pos = np.zeros_like(batch_tgt_start_idx, dtype="int64") tgt_pos = tgt_pos.reshape(-1, 1, 1) - batch["init_score"] = np.zeros_like(tgt_ids, dtype="float32").reshape(-1, 1).tolist() + batch["init_score"] = np.zeros_like( + tgt_ids, dtype="float32").reshape(-1, 1).tolist() batch["tgt_ids"] = tgt_ids.tolist() batch["tgt_pos"] = tgt_pos.tolist() batch["parent_idx"] = np.array(range(batch_size), dtype="int32") - batch["tgt_generation_mask"] = batch["generation_mask"][:, 0:1, :].astype("float32") + batch["tgt_generation_mask"] = batch[ + "generation_mask"][:, 0:1, :].astype("float32") else: - mask_return_list = mask( - batch_tokens=batch_token_ids, - vocab_size=self.vocab_size, - sent_b_starts=batch_tgt_start_idx, - is_unidirectional=True, - use_latent=True, - use_bow=self.use_bow) + mask_return_list = mask(batch_tokens=batch_token_ids, + vocab_size=self.vocab_size, + sent_b_starts=batch_tgt_start_idx, + is_unidirectional=True, + use_latent=True, + use_bow=self.use_bow) batch["tgt_label"] = mask_return_list[0] batch["tgt_pos"] = mask_return_list[1] if self.use_bow: @@ -79,5 +84,6 @@ class PlatoReader(DialogReader): batch["bow_pos"] = mask_return_list[3] batch_data_id = [record.data_id for record in batch_records] - batch["data_id"] = np.array(batch_data_id).astype("int64").reshape([-1, 1]) + batch["data_id"] = np.array(batch_data_id).astype("int64").reshape( + [-1, 1]) return batch diff --git a/modules/text/text_generation/plato2_en_large/tasks/__init__.py b/modules/text/text_generation/plato2_en_large/tasks/__init__.py deleted file mode 100644 index f81c1d18..00000000 --- a/modules/text/text_generation/plato2_en_large/tasks/__init__.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Define task.""" - -from .task_base import Task - -TASK_REGISTRY = {} - -__all__ = ["TASK_REGISTRY", "register_task", "create_task", "add_cmdline_args"] - - -def register_task(name): - """ - Register a new task class. 
- """ - - def __wrapped__(cls): - if name in TASK_REGISTRY: - raise ValueError(f"Cannot register duplicate task ({name})") - if not issubclass(cls, Task): - raise ValueError(f"Task ({name}: {cls.__name__}) must extend Task") - TASK_REGISTRY[name] = cls - return cls - - return __wrapped__ - - -def create_task(args) -> Task: - """ - Create a task. - """ - return TASK_REGISTRY[args.task](args) - - -def add_cmdline_args(parser): - """ - Add cmdline argument of Task. - """ - group = parser.add_argument_group("Task") - group.add_argument("--task", type=str, required=True) - - args, _ = parser.parse_known_args() - if args.task not in TASK_REGISTRY: - raise ValueError(f"Unknown task type: {args.task}") - TASK_REGISTRY[args.task].add_cmdline_args(parser) - return group - - -import plato2_en_large.tasks.dialog_generation -import plato2_en_large.tasks.next_sentence_prediction diff --git a/modules/text/text_generation/plato2_en_large/tasks/dialog_generation.py b/modules/text/text_generation/plato2_en_large/tasks/dialog_generation.py deleted file mode 100644 index 5f425846..00000000 --- a/modules/text/text_generation/plato2_en_large/tasks/dialog_generation.py +++ /dev/null @@ -1,292 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Dialogue generation task.""" - -from collections import defaultdict -import math - -from plato2_en_large.readers.dialog_reader import DialogReader -from plato2_en_large.readers.plato_reader import PlatoReader -from plato2_en_large.tasks import register_task -from plato2_en_large.tasks.task_base import Task -from plato2_en_large.utils.args import str2bool -from plato2_en_large.utils.inference import create_predictor - - -def post_process_context(token_ids, reader, merge=True): - """Post-process the context sequence.""" - context = [] - utt = [] - for tok_id in token_ids[1:]: - if tok_id == reader.eos_id: - utt = reader.tokenizer.convert_ids_to_tokens(utt) - if merge: - utt = reader.tokenizer.merge_subword(utt) - context.append(utt) - utt = [] - else: - utt.append(tok_id) - return context - - -def post_process_response(token_ids, reader, merge=True): - """ - Post-process the decoded sequence. Truncate from the first - and remove the and tokens currently. 
- """ - eos_pos = len(token_ids) - for i, tok_id in enumerate(token_ids): - if tok_id == reader.eos_id: - eos_pos = i - break - token_ids = token_ids[1:eos_pos] - response = reader.tokenizer.convert_ids_to_tokens(token_ids) - if merge: - response = reader.tokenizer.merge_subword(response) - return token_ids, response - - -def get_cross_turn_repetition(context, pred_tokens, eos_idx, is_cn=False): - """Get cross-turn repetition.""" - if len(pred_tokens) == 0: - return 1.0 - if is_cn: - context = ["".join(utt) for utt in context] - pred_tokens = "".join(pred_tokens) - - pred_tri_grams = set() - for i in range(len(pred_tokens) - 2): - tri_gram = tuple(pred_tokens[i:i + 3]) - pred_tri_grams.add(tri_gram) - for utt in context: - for i in range(len(utt) - 2): - tri_gram = tuple(utt[i:i + 3]) - if tri_gram in pred_tri_grams: - return 1.0 - return 0.0 - - -def get_in_turn_repetition(pred, is_cn=False): - """Get in-turn repetition.""" - if len(pred) == 0: - return 1.0 - if isinstance(pred[0], str): - pred = [tok.lower() for tok in pred] - if is_cn: - pred = "".join(pred) - tri_grams = set() - for i in range(len(pred) - 2): - tri_gram = tuple(pred[i:i + 3]) - if tri_gram in tri_grams: - return 1.0 - tri_grams.add(tri_gram) - return 0.0 - - -def get_nsp_score_batch(nsp_predictor, predictions): - """ - Get NSP scores of a batch. - """ - import argparse - from collections import namedtuple - - from plato2_en_large.readers.nsp_reader import NSPReader - from plato2_en_large.utils.args import parse_args - from plato2_en_large.tasks.next_sentence_prediction import NextSentencePrediction - - parser = argparse.ArgumentParser() - NextSentencePrediction.add_cmdline_args(parser) - parser.add_argument("--num_samples", type=int, default=None) - parser.add_argument("--config_path", type=str, required=True) - parser.add_argument("--mem_efficient", type=str2bool, default=False) - - args = parse_args(parser, allow_unknown=True) - args.load(args.config_path) - if not args.mem_efficient: - if args.num_samples: - args.batch_size *= args.num_samples - if args.latent_type_size: - args.batch_size *= args.latent_type_size - args.tokenized_input = True - reader = NSPReader(args) - - def __reader__(): - headers = ["src", "tgt", "data_id"] - - Example = namedtuple("Example", headers) - - for i, info in enumerate(predictions): - context = post_process_context(info["context_token_ids"], reader, merge=False) - context_tokenized_input = " [SEP] ".join(" ".join(utt) for utt in context) - _, response = post_process_response(info["response_token_ids"], reader, merge=False) - response_tokenized_input = " ".join(response) - example = Example(src=context_tokenized_input, tgt=response_tokenized_input, data_id=i) - record = reader._convert_example_to_record(example, is_infer=True) - yield record - return - - generator = reader.data_generator( - reader=__reader__, - is_infer=True, - phase="test", - ) - - steps = 0 - for data in generator(): - outputs = nsp_predictor(data) - for probs, data_id in zip(outputs[0], outputs[-1]): - data_id = data_id[0] - info = predictions[data_id] - info["nsp_score"] = float(probs[1]) - - return - - -@register_task("DialogGeneration") -class DialogGeneration(Task): - """ - Define dialogue response generation. 
- """ - - @classmethod - def add_cmdline_args(cls, parser): - """Add cmdline argurments.""" - group = parser.add_argument_group("Task") - group.add_argument("--do_generation", type=str2bool, default=False) - group.add_argument("--is_cn", type=str2bool, default=False) - - group.add_argument("--nsp_inference_model_path", type=str, default=None) - group.add_argument("--nsp_attention_style", type=str, default="bidirectional") - - group.add_argument("--ranking_score", type=str, default="decode_score") - - args, _ = parser.parse_known_args() - if args.model == "Plato": - PlatoReader.add_cmdline_args(parser) - else: - DialogReader.add_cmdline_args(parser) - return group - - def __init__(self, args): - super(DialogGeneration, self).__init__(args) - self.do_generation = args.do_generation - self.is_cn = args.is_cn - if args.model == "Plato": - self.reader = PlatoReader(args) - else: - self.reader = DialogReader(args) - - if args.nsp_inference_model_path: - self.nsp_predictor = create_predictor(args.nsp_inference_model_path, args.is_distributed) - self.nsp_attention_style = args.nsp_attention_style - else: - self.nsp_predictor = None - - self.ranking_score = args.ranking_score - self.max_dec_len = args.max_dec_len - return - - def _post_process_generation_output(self, predictions): - """ - Post process generation output. - - Calculate repetion, reranking. - """ - for info in predictions: - tokens = post_process_context(info["context_token_ids"], self.reader) - pred_token_ids, pred_tokens = post_process_response(info["response_token_ids"], self.reader) - info["context"] = " [SEP] ".join(" ".join(u) for u in tokens) - info["response"] = " ".join(pred_tokens) - info["num_token"] = len(pred_token_ids) - info["cross_turn_repetition"] = get_cross_turn_repetition(tokens, pred_tokens, self.reader.eos_id, - self.is_cn) - info["in_turn_repetition"] = max( - get_in_turn_repetition(pred_tokens, self.is_cn), get_in_turn_repetition(pred_token_ids)) - if self.nsp_predictor is not None: - get_nsp_score_batch(self.nsp_predictor, predictions) - - group = defaultdict(list) - for info in predictions: - group[info["data_id"]].append(info) - - predictions = [] - for data_id in group: - infos = group[data_id] - for info in infos: - info["score"] = info[self.ranking_score] - if self.max_dec_len is not None and info["num_token"] >= self.max_dec_len: # not ending - info["score"] -= 1e3 - elif info["cross_turn_repetition"] > 0: - info["score"] -= 1e3 - elif info["in_turn_repetition"] > 0: - info["score"] -= 1e3 - infos = sorted(infos, key=lambda info: -info["score"]) - pred = infos[0] - keep_attr = ["data_id", "score", "response"] - pred = {k: pred[k] for k in keep_attr} - predictions.append(pred) - return predictions - - def _post_process_scoring_output(self, predictions): - raise NotImplementedError - - def _post_process_infer_output(self, predictions): - if self.do_generation: - return self._post_process_generation_output(predictions) - else: - return self._post_process_scoring_output(predictions) - - def merge_mertrics_and_statistics(self, outputs, part_outputs): - """ - Merge two evaulation output. 
- """ - if outputs is None: - return part_outputs - - if part_outputs is None: - return outputs - - batch_size = outputs.pop("batch_size") - tokens_num = outputs.pop("tokens_num") - part_batch_size = part_outputs.pop("batch_size") - part_tokens_num = part_outputs.pop("tokens_num") - - new_outputs = {"batch_size": batch_size + part_batch_size, "tokens_num": tokens_num + part_tokens_num} - for k in outputs: - if k.startswith("token_"): - new_outputs[k] = ( - outputs[k] * tokens_num + part_outputs[k] * part_tokens_num) / new_outputs["tokens_num"] - else: - new_outputs[k] = ( - outputs[k] * batch_size + part_outputs[k] * part_batch_size) / new_outputs["batch_size"] - return new_outputs - - def get_metrics(self, outputs): - """ - Get metrics. - """ - if outputs is None: - raise ValueError("metrics is None") - outputs = dict(outputs) - outputs.pop("batch_size", None) - outputs.pop("tokens_num", None) - metrics = {} - for k in outputs: - if k.startswith("token_"): - metrics[k[6:]] = outputs[k] - else: - metrics[k] = outputs[k] - if k == "token_lm_loss": - metrics["ppl"] = math.exp(outputs[k]) - return metrics diff --git a/modules/text/text_generation/plato2_en_large/tasks/next_sentence_prediction.py b/modules/text/text_generation/plato2_en_large/tasks/next_sentence_prediction.py deleted file mode 100644 index 0f0d65d4..00000000 --- a/modules/text/text_generation/plato2_en_large/tasks/next_sentence_prediction.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Next sentence prediction task.""" - -from plato2_en_large.readers.nsp_reader import NSPReader -from plato2_en_large.tasks import register_task -from plato2_en_large.tasks.task_base import Task -from plato2_en_large.utils.args import str2bool - - -@register_task("NextSentencePrediction") -class NextSentencePrediction(Task): - """ - Define dialogue response generation. - """ - - @classmethod - def add_cmdline_args(cls, parser): - """Add cmdline argurments.""" - group = NSPReader.add_cmdline_args(parser) - return group - - def __init__(self, args): - super(NextSentencePrediction, self).__init__(args) - self.reader = NSPReader(args) - return - - def _post_process_infer_output(self, predictions): - predictions = [{ - "data_id": data_id.tolist()[0], - "score": score.tolist()[1] - } for data_id, score in zip(predictions["data_id"], predictions["scores"])] - return predictions diff --git a/modules/text/text_generation/plato2_en_large/tasks/task_base.py b/modules/text/text_generation/plato2_en_large/tasks/task_base.py deleted file mode 100644 index dc4bbe44..00000000 --- a/modules/text/text_generation/plato2_en_large/tasks/task_base.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Task base.""" - -from abc import abstractmethod, ABC - -from plato2_en_large.models.model_base import Model - - -class Task(ABC): - """ - Basic task. - """ - - def __init__(self, args): - return - - def train_step(self, model: Model, inputs): - """Run one training step.""" - outputs = model.train_step(inputs) - outputs = {k: v.tolist()[0] for k, v in outputs.items()} - return outputs - - def eval_step(self, model: Model, inputs): - """Run one evaluation step""" - outputs = model.eval_step(inputs) - outputs = {k: v.tolist()[0] for k, v in outputs.items()} - return outputs - - def infer_step(self, model: Model, inputs): - """Run one inference step.""" - predictions = model.infer_step(inputs) - outputs = self._post_process_infer_output(predictions) - return outputs - - def _post_process_infer_output(self, predictions): - """ - Post-process inference output. - """ - return predictions - - def merge_mertrics_and_statistics(self, outputs, part_outputs): - """ - Merge metrics and statistics. - """ - if outputs is None: - return part_outputs - - if part_outputs is None: - return outputs - - batch_size = outputs.pop("batch_size") - part_batch_size = part_outputs.pop("batch_size") - - new_outputs = { - "batch_size": batch_size + part_batch_size, - } - for k in outputs: - new_outputs[k] = (outputs[k] * batch_size + part_outputs[k] * part_batch_size) / new_outputs["batch_size"] - return new_outputs - - def get_metrics(self, outputs): - """ - Get metrics. 
- """ - if outputs is None: - raise ValueError("metrics is None") - outputs = dict(outputs) - # pop statistics - outputs.pop("batch_size", None) - return outputs - - def get_data_loader(self, model, *args, is_infer=False, **kwargs): - generator = self.reader.data_generator(*args, is_infer=is_infer, **kwargs) - return model.get_data_loader(generator, is_infer) diff --git a/modules/text/text_generation/plato2_en_large/utils/__init__.py b/modules/text/text_generation/plato2_en_large/utils/__init__.py index 4068cbd6..8865dcbb 100644 --- a/modules/text/text_generation/plato2_en_large/utils/__init__.py +++ b/modules/text/text_generation/plato2_en_large/utils/__init__.py @@ -14,160 +14,45 @@ """Utils.""" from itertools import chain -import os -import time -import sys - import numpy as np -import paddle.fluid as fluid +import paddle -def to_lodtensor(data, place): - """Convert data to LoDTensor.""" - if place is None: - return data - lengths = [] - while isinstance(data[0], list): - lengths.append(list(map(len, data))) - data = [x for xs in data for x in xs] - if isinstance(data[0], float): - data = np.array(data, dtype="float32") +def repeat_array(array, times): + """Repeate numpy array.""" + if isinstance(array, list): + return list(chain(*([array] * times))) else: - data = np.array(data, dtype="int64") - data_tensor = fluid.LoDTensor() - data_tensor.set(data, place) - data_tensor.set_recursive_sequence_lengths(lengths) - return data_tensor + return np.concatenate([array] * times, axis=0) + + +def gen_inputs(inputs, latent_type_size): + batch_size = len(inputs["data_id"]) + new_bsz = batch_size * latent_type_size + inputs = { + name: repeat_array(array, latent_type_size) + for name, array in inputs.items() + } + # Add latent_id + inputs["latent_id"] = np.array( + [i for i in range(latent_type_size) for _ in range(batch_size)], + dtype="int64").reshape([-1, 1]) + + #print('\nplato_inputs:') + for key in inputs: + inputs[key] = paddle.to_tensor(inputs[key]) + if key in [ + 'token_ids', 'type_ids', 'pos_ids', 'tgt_ids', 'tgt_pos', + 'data_id' + ]: + inputs[key] = paddle.squeeze(inputs[key], axis=-1) + #print(key, inputs[key].shape, inputs[key].dtype) + return inputs def pad_batch_data(insts, pad_id=0): """Pad the instances to the max sequence length in batch. 
""" max_len = max(map(len, insts)) - inst_data = np.array([list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts]) + inst_data = np.array( + [list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts]) return inst_data.astype("int64").reshape([-1, max_len, 1]) - - -def convert_lodtensor_to_list(tensor): - data = np.array(tensor) - recursive_sequence_lengths = tensor.recursive_sequence_lengths() - recursive_sequence_lengths.reverse() - for i, lengths in enumerate(recursive_sequence_lengths): - shift = 0 - new_data = [] - for j, l in enumerate(lengths): - new_data.append(data[shift:shift + l]) - shift += l - data = new_data - return data - - -def concatenate_lodtensors(tensors, place): - """Concatenate LoD tensors.""" - data = [] - recursive_sequence_lengths = [] - for tensor in tensors: - data.append(np.array(tensor)) - recursive_sequence_lengths.append(tensor.recursive_sequence_lengths()) - data = np.concatenate(data, axis=0) - recursive_sequence_lengths = [sum(lens, []) for lens in zip(*recursive_sequence_lengths)] - data_tensor = fluid.LoDTensor() - data_tensor.set(data, place) - data_tensor.set_recursive_sequence_lengths(recursive_sequence_lengths) - assert data_tensor.has_valid_recursive_sequence_lengths() - return data_tensor - - -def repeat_array_or_tensor(array_or_tensor, place, times): - """Repeate numpy array or LoD tensor.""" - if isinstance(array_or_tensor, fluid.LoDTensor): - data = [np.array(array_or_tensor)] * times - recursive_sequence_lengths = [array_or_tensor.recursive_sequence_lengths()] * times - data = np.concatenate(data, axis=0) - recursive_sequence_lengths = [sum(lens, []) for lens in zip(*recursive_sequence_lengths)] - data_tensor = fluid.LoDTensor() - data_tensor.set(data, place) - data_tensor.set_recursive_sequence_lengths(recursive_sequence_lengths) - assert data_tensor.has_valid_recursive_sequence_lengths() - return data_tensor - elif isinstance(array_or_tensor, list): - return list(chain(*([array_or_tensor] * times))) - else: - return np.concatenate([array_or_tensor] * times, axis=0) - - -def slice_array_or_tensor(array_or_tensor, place, begin, end): - """Repeate numpy array or LoD tensor.""" - if isinstance(array_or_tensor, fluid.LoDTensor): - data = convert_lodtensor_to_list(array_or_tensor) - data = data[begin:end] - return to_lodtensor(data, place) - else: - return array_or_tensor[begin:end] - - -def init_checkpoint(exe, init_checkpoint_path, main_program): - """Initialize from checkpoint.""" - assert os.path.exists(init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path - - def existed_persitables(var): - """Whether var is a persistables.""" - if not fluid.io.is_persistable(var): - return False - return os.path.exists(os.path.join(init_checkpoint_path, var.name)) - - fluid.io.load_vars(exe, init_checkpoint_path, main_program=main_program, predicate=existed_persitables) - print(f"Load model from {init_checkpoint_path}") - - -def init_pretraining_params(exe, pretraining_params_path, main_program): - """Only initialize parameters.""" - assert os.path.exists(pretraining_params_path), "[%s] cann't be found." 
% pretraining_params_path - - def existed_params(var): - """Whether var is a parameter.""" - if not isinstance(var, fluid.framework.Parameter): - return False - return os.path.exists(os.path.join(pretraining_params_path, var.name)) - - fluid.io.load_vars(exe, pretraining_params_path, main_program=main_program, predicate=existed_params) - print(f"Load pretraining parameters from {pretraining_params_path}.") - - return - - -class Timer(object): - def __init__(self): - self._pass_time = 0 - self._start_time = None - return - - def start(self): - self._start_time = time.time() - - def pause(self): - self._pass_time += time.time() - self._start_time - self._start_time = None - - def reset(self): - self._pass_time = 0 - - @property - def pass_time(self): - if self._start_time is None: - return self._pass_time - else: - return self._pass_time + time.time() - self._start_time - - -ERROR_MESSAGE = "\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \ - Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n" - - -def check_cuda(use_cuda, err=ERROR_MESSAGE): - """Check CUDA.""" - try: - if use_cuda and not fluid.is_compiled_with_cuda(): - print(err) - sys.exit(1) - except Exception as e: - pass diff --git a/modules/text/text_generation/plato2_en_large/utils/args.py b/modules/text/text_generation/plato2_en_large/utils/args.py index 900c6bf2..514151ec 100644 --- a/modules/text/text_generation/plato2_en_large/utils/args.py +++ b/modules/text/text_generation/plato2_en_large/utils/args.py @@ -15,9 +15,6 @@ import argparse import json -import sys - -import paddle.fluid as fluid def str2bool(v): diff --git a/modules/text/text_generation/plato2_en_large/utils/inference.py b/modules/text/text_generation/plato2_en_large/utils/inference.py deleted file mode 100644 index f21b0162..00000000 --- a/modules/text/text_generation/plato2_en_large/utils/inference.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Inference utils.""" - -import os - -import paddle.fluid as fluid - - -def create_predictor(inference_model_path, is_distributed=False): - """Create predictor.""" - if is_distributed: - dev_count = fluid.core.get_cuda_device_count() - gpu_id = int(os.getenv("FLAGS_selected_gpus")) - else: - dev_count = 1 - gpu_id = 0 - - place = fluid.CUDAPlace(gpu_id) - exe = fluid.Executor(place) - - scope = fluid.Scope() - with fluid.scope_guard(scope): - inference_prog, feed_target_names, fetch_targets = fluid.io.load_inference_model(inference_model_path, exe) - - def __predict__(inputs): - with fluid.scope_guard(scope): - outputs = exe.run(inference_prog, feed=inputs, fetch_list=fetch_targets, return_numpy=True) - return outputs - - return __predict__ diff --git a/modules/text/text_generation/plato2_en_large/utils/masking.py b/modules/text/text_generation/plato2_en_large/utils/masking.py index c9c40bec..9f0eb2d8 100644 --- a/modules/text/text_generation/plato2_en_large/utils/masking.py +++ b/modules/text/text_generation/plato2_en_large/utils/masking.py @@ -15,8 +15,6 @@ import numpy as np -import plato2_en_large.utils - def mask(batch_tokens, vocab_size, @@ -47,14 +45,20 @@ def mask(batch_tokens, else: shift_len = 0 for sent_index, sent in enumerate(batch_tokens): - sent_b_index = sent_b_starts[sent_index] if sent_b_starts is not None else 0 + sent_b_index = sent_b_starts[ + sent_index] if sent_b_starts is not None else 0 need_cal = True if labels is not None: - label_pos.append(sent_index * max_len + len(sent) - 1 + shift_len) + label_pos.append(sent_index * max_len + len(sent) - 1 + + shift_len) if labels[sent_index] == 0: need_cal = False mask_label.extend(sent[sent_b_index + 1:]) - mask_pos.extend([sent_index * max_len + i + shift_len for i in range(sent_b_index, len(sent) - 1)]) + mask_pos.extend([ + sent_index * max_len + i + shift_len + for i in range(sent_b_index, + len(sent) - 1) + ]) mask_label = np.array(mask_label).astype("int64").reshape([-1, 1]) mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1]) return_list = [mask_label, mask_pos] @@ -64,14 +68,21 @@ def mask(batch_tokens, bow_label = [] bow_pos = [] for sent_index, sent in enumerate(batch_tokens): - sent_b_index = sent_b_starts[sent_index] if sent_b_starts is not None else 0 + sent_b_index = sent_b_starts[ + sent_index] if sent_b_starts is not None else 0 def __filter__(tok_id): # TODO: exclude [EOS] from bow loss return True - bow_pos.extend([sent_index for i in range(sent_b_index + 1, len(sent)) if __filter__(sent[i])]) - bow_label.extend([sent[i] for i in range(sent_b_index + 1, len(sent)) if __filter__(sent[i])]) + bow_pos.extend([ + sent_index for i in range(sent_b_index + 1, len(sent)) + if __filter__(sent[i]) + ]) + bow_label.extend([ + sent[i] for i in range(sent_b_index + 1, len(sent)) + if __filter__(sent[i]) + ]) bow_label = np.array(bow_label).astype("int64").reshape([-1, 1]) bow_pos = np.array(bow_pos).astype("int64").reshape([-1, 1]) return_list += [bow_label, bow_pos] @@ -80,7 +91,9 @@ def mask(batch_tokens, total_token_num = sum(map(len, batch_tokens)) prob_mask = np.random.rand(total_token_num) # TODO: fix replace_ids, include [UNK] - replace_ids = np.random.randint(3, high=vocab_size, size=total_token_num) + replace_ids = np.random.randint(3, + high=vocab_size, + size=total_token_num) prob_index = 0 for sent_index, sent in enumerate(batch_tokens): # add pair label position diff --git a/modules/text/text_generation/plato2_en_large/utils/tokenization.py 
b/modules/text/text_generation/plato2_en_large/utils/tokenization.py index 216ef86b..d2ad33a0 100644 --- a/modules/text/text_generation/plato2_en_large/utils/tokenization.py +++ b/modules/text/text_generation/plato2_en_large/utils/tokenization.py @@ -14,14 +14,10 @@ """Tokenization classes.""" import collections -import json import sentencepiece as spm -import six import unicodedata -from plato2_en_large.utils.args import str2bool - -SPIECE_UNDERLINE = u"▁".encode("utf-8") +from .args import str2bool def clean_text(text): @@ -79,15 +75,18 @@ def encode_ids(spm_model, text, sample=False): def convert_to_unicode(text): """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" - if isinstance(text, six.binary_type): + if isinstance(text, str): + return text + elif isinstance(text, bytes): return text.decode("utf-8", "ignore") - return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() - fin = open(vocab_file) + fin = open(vocab_file, 'r', encoding="UTF-8") for num, line in enumerate(fin): items = convert_to_unicode(line.rstrip()).split("\t") if len(items) > 2: -- GitLab
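For reviewers, a minimal sketch of what the reworked numpy-side helpers in `utils/__init__.py` do after this patch. The bodies of `repeat_array` and `pad_batch_data` are copied from the hunk above; the toy token ids and the `latent_type_size` value are made-up illustrations, and the real `gen_inputs` additionally converts the arrays to paddle tensors with `paddle.to_tensor`.

```python
from itertools import chain

import numpy as np


def repeat_array(array, times):
    """Repeat a list or numpy array `times` times along the batch axis."""
    if isinstance(array, list):
        return list(chain(*([array] * times)))
    return np.concatenate([array] * times, axis=0)


def pad_batch_data(insts, pad_id=0):
    """Pad token-id lists to the batch max length -> int64 array of shape [B, T, 1]."""
    max_len = max(map(len, insts))
    inst_data = np.array([list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts])
    return inst_data.astype("int64").reshape([-1, max_len, 1])


# Toy batch of two utterances with hypothetical token ids.
token_ids = pad_batch_data([[1, 5, 9], [1, 7]], pad_id=0)
print(token_ids.shape)  # (2, 3, 1)

# gen_inputs() tiles every input field latent_type_size times and appends a
# latent_id column, so each sample is decoded once per latent value.
latent_type_size = 4  # illustrative; the module reads this from its config
batch_size = token_ids.shape[0]
tiled = repeat_array(token_ids, latent_type_size)  # shape (8, 3, 1)
latent_id = np.array([i for i in range(latent_type_size) for _ in range(batch_size)],
                     dtype="int64").reshape([-1, 1])
print(tiled.shape, latent_id.ravel().tolist())  # (8, 3, 1) [0, 0, 1, 1, 2, 2, 3, 3]
```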