diff --git a/PaddleNLP/PaddleMT/transformer/README.md b/PaddleNLP/PaddleMT/transformer/README.md
index 2d5e31a6ee16bda2b043d19b49371091596f3a1f..90d47f53cc4566bc5428d95e24ee43641e27c90b 100644
--- a/PaddleNLP/PaddleMT/transformer/README.md
+++ b/PaddleNLP/PaddleMT/transformer/README.md
@@ -39,7 +39,7 @@
    Clone the repository to your local machine
    ```shell
    git clone https://github.com/PaddlePaddle/models.git
-   cd models/PaddleNLP/neural_machine_translation/transformer
+   cd models/PaddleNLP/PaddleMT/transformer
    ```
 
 3. Environment dependencies
diff --git a/PaddleNLP/PaddleMT/transformer/inference_model.py b/PaddleNLP/PaddleMT/transformer/inference_model.py
index d1b88f5be965f85eac1bc703ee10f7cca57bcb78..40fc7edeb229d3eb1cfbf4f6c4911b3716291efa 100644
--- a/PaddleNLP/PaddleMT/transformer/inference_model.py
+++ b/PaddleNLP/PaddleMT/transformer/inference_model.py
@@ -22,9 +22,8 @@ import numpy as np
 import paddle
 import paddle.fluid as fluid
 
-#include palm for easier nlp coding
-from palm.toolkit.input_field import InputField
-from palm.toolkit.configure import PDConfig
+from utils.input_field import InputField
+from utils.configure import PDConfig
 
 # include task-specific libs
 import desc
diff --git a/PaddleNLP/PaddleMT/transformer/main.py b/PaddleNLP/PaddleMT/transformer/main.py
index 6ff929af0e72296bc635a56d90d2c0925b5bad68..feaf29baeb386b7843651ff9fc4197861d702c66 100644
--- a/PaddleNLP/PaddleMT/transformer/main.py
+++ b/PaddleNLP/PaddleMT/transformer/main.py
@@ -20,13 +20,12 @@ import numpy as np
 import paddle
 import paddle.fluid as fluid
 
-#include palm for easier nlp coding
-from palm.toolkit.configure import PDConfig
-
+from utils.configure import PDConfig
 from train import do_train
 from predict import do_predict
 from inference_model import do_save_inference_model
 
+
 if __name__ == "__main__":
     LOG_FORMAT = "[%(asctime)s %(levelname)s %(filename)s:%(lineno)d] %(message)s"
     logging.basicConfig(
diff --git a/PaddleNLP/PaddleMT/transformer/transformer.py b/PaddleNLP/PaddleMT/transformer/transformer.py
index 3e2367c1156b06dad7daa5d1409139c3c01c7c2e..be20001b25fdb94fcc4bc234bae220413ddfacdd 100644
--- a/PaddleNLP/PaddleMT/transformer/transformer.py
+++ b/PaddleNLP/PaddleMT/transformer/transformer.py
@@ -301,16 +301,15 @@ def prepare_encoder_decoder(src_word,
         src_word,
         size=[src_vocab_size, src_emb_dim],
         padding_idx=bos_idx,  # set embedding of bos to 0
-        param_attr=fluid.ParamAttr(
-            name=word_emb_param_name,
-            initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
+        param_attr=fluid.ParamAttr(name=word_emb_param_name,
+                                   initializer=fluid.initializer.Normal(
+                                       0., src_emb_dim**-0.5)))
     src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5)
-    src_pos_enc = fluid.embedding(
-        src_pos,
-        size=[src_max_len, src_emb_dim],
-        param_attr=fluid.ParamAttr(
-            name=pos_enc_param_name, trainable=False))
+    src_pos_enc = fluid.embedding(src_pos,
+                                  size=[src_max_len, src_emb_dim],
+                                  param_attr=fluid.ParamAttr(
+                                      name=pos_enc_param_name, trainable=False))
     src_pos_enc.stop_gradient = True
     enc_input = src_word_emb + src_pos_enc
     return layers.dropout(
@@ -537,51 +536,48 @@ def transformer(model_input,
     label = model_input.lbl_word
     weights = model_input.lbl_weight
 
-    enc_output = wrap_encoder(
-        enc_inputs,
-        src_vocab_size,
-        max_length,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-
d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - enc_output=enc_output) + enc_output = wrap_encoder(enc_inputs, + src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + bos_idx=bos_idx) + + predict = wrap_decoder(dec_inputs, + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + enc_output=enc_output) # Padding index do not contribute to the total loss. The weights is used to # cancel padding index in calculating the loss. if label_smooth_eps: # TODO: use fluid.input.one_hot after softmax_with_cross_entropy removing # the enforcement that the last dimension of label must be 1. - label = layers.label_smooth( - label=layers.one_hot( - input=label, depth=trg_vocab_size), - epsilon=label_smooth_eps) + label = layers.label_smooth(label=layers.one_hot(input=label, + depth=trg_vocab_size), + epsilon=label_smooth_eps) cost = layers.softmax_with_cross_entropy( logits=predict, @@ -726,23 +722,22 @@ def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len, dec_inputs = (model_input.trg_word, model_input.init_score, model_input.init_idx, model_input.trg_src_attn_bias) - enc_output = wrap_encoder( - enc_inputs, - src_vocab_size, - max_in_len, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - bos_idx=bos_idx) + enc_output = wrap_encoder(enc_inputs, + src_vocab_size, + max_in_len, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + bos_idx=bos_idx) start_tokens, init_scores, parent_idx, trg_src_attn_bias = dec_inputs def beam_search(): @@ -801,26 +796,25 @@ def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len, dtype=pre_ids.dtype), y=step_idx, axis=0) - logits = wrap_decoder( - (pre_ids, pre_pos, None, pre_src_attn_bias), - trg_vocab_size, - max_in_len, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - enc_output=enc_output, - caches=caches, - gather_idx=parent_idx, - bos_idx=bos_idx) + logits = wrap_decoder((pre_ids, pre_pos, None, pre_src_attn_bias), + trg_vocab_size, + max_in_len, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + enc_output=enc_output, + caches=caches, + gather_idx=parent_idx, + bos_idx=bos_idx) # intra-beam topK topk_scores, topk_indices = layers.topk( input=layers.softmax(logits), k=beam_size) diff --git a/PaddleNLP/neural_machine_translation/transformer/.run_ce.sh b/PaddleNLP/neural_machine_translation/transformer/.run_ce.sh deleted file mode 100755 index 357cb7386114eba8266d1240676c4599aff2de01..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/.run_ce.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -DATA_PATH=./dataset/wmt16 - -train(){ - python -u 
main.py \
-        --do_train True \
-        --src_vocab_fpath $DATA_PATH/en_10000.dict \
-        --trg_vocab_fpath $DATA_PATH/de_10000.dict \
-        --special_token '<s>' '<e>' '<unk>' \
-        --training_file $DATA_PATH/wmt16/train \
-        --use_token_batch True \
-        --batch_size 2048 \
-        --sort_type pool \
-        --pool_size 10000 \
-        --print_step 1 \
-        --weight_sharing False \
-        --epoch 20 \
-        --enable_ce True \
-        --random_seed 1000 \
-        --save_checkpoint "" \
-        --save_param ""
-}
-
-cudaid=${transformer:=0} # use 0-th card as default
-export CUDA_VISIBLE_DEVICES=$cudaid
-
-train | python _ce.py
-
-cudaid=${transformer_m:=0,1,2,3} # use 0,1,2,3 card as default
-export CUDA_VISIBLE_DEVICES=$cudaid
-
-train | python _ce.py
\ No newline at end of file
diff --git a/PaddleNLP/neural_machine_translation/transformer/README.md b/PaddleNLP/neural_machine_translation/transformer/README.md
deleted file mode 100644
index 2d5e31a6ee16bda2b043d19b49371091596f3a1f..0000000000000000000000000000000000000000
--- a/PaddleNLP/neural_machine_translation/transformer/README.md
+++ /dev/null
@@ -1,270 +0,0 @@
-## Transformer
-
-Below is a brief directory structure and description for this example:
-
-```text
-.
-├── images               # images used in this README
-├── utils                # utilities
-├── desc.py              # input descriptions
-├── gen_data.sh          # data generation script
-├── inference_model.py   # script for saving an inference model
-├── main.py              # main entry point
-├── predict.py           # prediction script
-├── reader.py            # data reading interface
-├── README.md            # documentation
-├── train.py             # training script
-├── transformer.py       # model definition
-└── transformer.yaml     # configuration file
-```
-
-## Model Introduction
-
-Machine translation (MT) uses computers to transform one natural language (the source language) into another (the target language): the input is a source-language sentence and the output is the corresponding target-language sentence.
-
-This project is a PaddlePaddle implementation of Transformer, the mainstream model in machine translation. It covers model training, prediction, and the use of custom data, so users can build their own translation models on top of what is released here.
-
-We also recommend the [IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/122281).
-
-## Quick Start
-
-### Installation
-
-1. PaddlePaddle installation
-
-   This project depends on PaddlePaddle 1.6 or above (or an appropriate develop version); see the [installation guide](http://www.paddlepaddle.org/#quick-start).
-
-2. Download the code
-
-   Clone the repository to your local machine
-   ```shell
-   git clone https://github.com/PaddlePaddle/models.git
-   cd models/PaddleNLP/neural_machine_translation/transformer
-   ```
-
-3. Environment dependencies
-
-   See the [installation notes](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html) for PaddlePaddle.
-
-
-### Data Preparation
-
-Public dataset: the WMT shared tasks are the most authoritative international evaluations in machine translation. The English-German task provides a medium-sized dataset that is used in many papers, including the Transformer paper itself; we provide the [WMT'16 EN-DE dataset](http://www.statmt.org/wmt16/translation-task.html) as the example. Run the `gen_data.sh` script to download and preprocess it (this takes quite a while; consider running it in the background). Preprocessing mainly consists of tokenization and [byte-pair encoding (BPE)](https://arxiv.org/pdf/1508.07909). On success a `gen_data` folder is generated with the following structure:
-
-```text
-.
-├── wmt16_ende_data       # WMT16 English-German translation data
-├── wmt16_ende_data_bpe   # BPE-encoded WMT16 English-German translation data
-├── mosesdecoder          # Moses machine translation toolkit, including tokenization, BLEU evaluation and other scripts
-└── subword-nmt           # BPE encoding code
-```
-
-We also provide a preprocessed WMT'16 EN-DE dataset for [download](https://transformer-res.bj.bcebos.com/wmt16_ende_data_bpe_clean.tar.gz). It contains the vocabulary (the `vocab_all.bpe.32000` file), the BPE data needed for training (the `train.tok.clean.bpe.32000.en-de` file), the BPE data needed for prediction (`newstest2016.tok.bpe.32000.en-de` and similar files) and the corresponding tokenized data needed for evaluating predictions (`newstest2016.tok.de` and similar files).
-
-
-Custom data: to use your own data, the format directly supported by this project is tab (\t) separated source- and target-language sentence pairs, with tokens inside a sentence separated by spaces. Provide data files in this format (they may be split into multiple parts; the reader supports file globbing) together with the corresponding vocabulary files, and the project runs directly.
-
-### Single-Machine Training
-
-Taking the provided English-German data as an example, the model can be trained with:
-
-```sh
-# open garbage collection to save memory
-export FLAGS_eager_delete_tensor_gb=0.0
-# setting visible devices for training
-export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-
-python -u main.py \
-    --do_train True \
-    --epoch 30 \
-    --src_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-    --trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-    --special_token '<s>' '<e>' '<unk>' \
-    --training_file gen_data/wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de \
-    --batch_size 4096
-```
-
-The command above passes in flags for running training (`do_train`), the number of training epochs (`epoch`) and the training-data path (set it correctly; globbing is supported), among others. More flags and the supported model hyperparameters can be found in the `transformer.yaml` configuration file, which provides the Transformer base model configuration by default; values can be changed in the file or overridden on the command line (command-line values take precedence over the configuration file). The big model from the Transformer paper can be trained with:
-
-```sh
-# open garbage collection to save memory
-export FLAGS_eager_delete_tensor_gb=0.0
-# setting visible devices for training
-export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-
-python -u main.py \
-    --do_train True \
-    --epoch 30 \
-    --src_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-    --trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-    --special_token '<s>' '<e>' '<unk>' \
-    --training_file gen_data/wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de \
-    --batch_size 4096 \
-    --n_head 16 \
-    --d_model 1024 \
-    --d_inner_hid 4096 \
-    --prepostprocess_dropout 0.3
-```
-
-Training uses all GPUs by default; the number of GPUs used can be set through the `CUDA_VISIBLE_DEVICES` environment variable. CPU-only training is also possible (set `--use_cuda False`), at a relatively low speed. If `save_param` and `save_checkpoint` are provided (the defaults are trained_params and trained_ckpts), the current parameter values and a checkpoint are saved to the corresponding directories every given number of iterations (set by `save_step`, default 10000), and every given number of iterations (set by `print_step`, default 100) a log line like the following is printed to standard output:
-
-```txt
-[2019-08-02 15:30:51,656 INFO train.py:262] step_idx: 150100, epoch: 32, batch: 1364, avg loss: 2.880427, normalized loss: 1.504687, ppl: 17.821888, speed: 3.34 step/s
-[2019-08-02 15:31:19,824 INFO train.py:262] step_idx: 150200, epoch: 32, batch: 1464, avg loss: 2.955965, normalized loss: 1.580225, ppl: 19.220257, speed: 3.55 step/s
-[2019-08-02 15:31:48,151 INFO train.py:262] step_idx: 150300, epoch: 32, batch: 1564, avg loss: 2.951180, normalized loss: 1.575439, ppl: 19.128502, speed: 3.53 step/s
-[2019-08-02 15:32:16,401 INFO train.py:262] step_idx: 150400, epoch: 32, batch: 1664, avg loss: 3.027281, normalized loss: 1.651540, ppl: 20.641024, speed: 3.54 step/s
-[2019-08-02 15:32:44,764 INFO train.py:262] step_idx: 150500, epoch: 32, batch: 1764, avg loss: 3.069125, normalized loss: 1.693385, ppl: 21.523066, speed: 3.53 step/s
-[2019-08-02 15:33:13,199 INFO train.py:262] step_idx: 150600, epoch: 32, batch: 1864, avg loss: 2.869379, normalized loss: 1.493639, ppl: 17.626074, speed: 3.52 step/s
-[2019-08-02 15:33:41,601 INFO train.py:262] step_idx: 150700, epoch: 32, batch: 1964, avg loss: 2.980905, normalized loss: 1.605164, ppl: 19.705633, speed: 3.52 step/s
-[2019-08-02 15:34:10,079 INFO train.py:262] step_idx: 150800, epoch: 32, batch: 2064, avg loss: 3.047716, normalized loss: 1.671976, ppl: 21.067181, speed: 3.51 step/s
-[2019-08-02 15:34:38,598 INFO train.py:262] step_idx: 150900, epoch: 32, batch: 2164, avg loss: 2.956475, normalized loss: 1.580735, ppl: 19.230072, speed: 3.51 step/s
-```
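The quantities in these log lines can be cross-checked by hand: `ppl` is simply the exponential of `avg loss`, and `normalized loss` subtracts the constant entropy that label smoothing adds to the cross-entropy. A minimal sketch, assuming a label smoothing epsilon of 0.1 and a vocabulary size of roughly 37K (both are assumptions here; neither is printed in the log):

```python
import numpy as np

eps, vocab_size = 0.1, 37007  # assumed label_smooth_eps and BPE vocab size

# Constant entropy of the smoothed label distribution, subtracted from the
# raw cross-entropy to obtain the "normalized loss" in the log.
loss_normalizer = -((1. - eps) * np.log(1. - eps) +
                    eps * np.log(eps / (vocab_size - 1) + 1e-20))

avg_loss = 2.880427                # from the first log line above
print(np.exp(avg_loss))            # ~17.82, matches "ppl: 17.821888"
print(avg_loss - loss_normalizer)  # ~1.50, close to "normalized loss: 1.504687"
```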
-### Model Inference
-
-Taking the English-German data as an example, once training has finished the text in a given file can be translated with:
-
-```sh
-# open garbage collection to save memory
-export FLAGS_eager_delete_tensor_gb=0.0
-# setting visible devices for prediction
-export CUDA_VISIBLE_DEVICES=0
-
-python -u main.py \
-    --do_predict True \
-    --src_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-    --trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-    --special_token '<s>' '<e>' '<unk>' \
-    --predict_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \
-    --batch_size 32 \
-    --init_from_params trained_params/step_100000 \
-    --beam_size 5 \
-    --max_out_len 255 \
-    --output_file predict.txt
-```
-
- The translations of the text in the file given by `predict_file` are written to the file given by `output_file`. `init_from_params` must be set to the directory holding the model. More flags are documented in the comments in `transformer.yaml` and can be changed there. Note that any model hyperparameters set at prediction time must match those used in training; if the big model settings were used for training, prediction needs a command like:
-
-```sh
-# open garbage collection to save memory
-export FLAGS_eager_delete_tensor_gb=0.0
-# setting visible devices for prediction
-export CUDA_VISIBLE_DEVICES=0
-
-python -u main.py \
-    --do_predict True \
-    --src_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-    --trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-    --special_token '<s>' '<e>' '<unk>' \
-    --predict_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \
-    --batch_size 32 \
-    --init_from_params trained_params/step_100000 \
-    --beam_size 5 \
-    --max_out_len 255 \
-    --output_file predict.txt \
-    --n_head 16 \
-    --d_model 1024 \
-    --d_inner_hid 4096 \
-    --prepostprocess_dropout 0.3
-```
-
-
-### Model Evaluation
-
-Each line of the prediction output is the highest-scoring translation of the corresponding input line. For BPE data the predicted translations are also BPE-encoded, so they must be restored to the original (i.e. tokenized) form before evaluation is meaningful. Evaluation works as follows (BLEU is the standard automatic metric for translation tasks):
-
-```sh
-# restore the predictions in predict.txt to tokenized data
-sed -r 's/(@@ )|(@@ ?$)//g' predict.txt > predict.tok.txt
-# if the BLEU evaluation tool is missing, download it first
-# git clone https://github.com/moses-smt/mosesdecoder.git
-# taking the English-German newstest2014 test data as an example
-perl gen_data/mosesdecoder/scripts/generic/multi-bleu.perl gen_data/wmt16_ende_data/newstest2014.tok.de < predict.tok.txt
-```
-Output like the following can be expected:
-```
-BLEU = 26.35, 57.7/32.1/20.0/13.0 (BP=1.000, ratio=1.013, hyp_len=63903, ref_len=63078)
-```
-
-Using the contents of this project, the English-German base and big models reach roughly the following BLEU values after 100K iterations of training on eight GPUs:
-
-| Test set | newstest2014 | newstest2015 | newstest2016 |
-|-|-|-|-|
-| Base | 26.35 | 29.07 | 33.30 |
-| Big | 27.07 | 30.09 | 34.38 |
-
-### Pretrained Models
-
-We provide the [base model](https://transformer-res.bj.bcebos.com/base_model_params.tar.gz) and [big model](https://transformer-res.bj.bcebos.com/big_model_params.tar.gz) parameters corresponding to the BLEU values above for download (note that the models were trained and tested with the downloadable data provided above).
-
-## Advanced Usage
-
-### Background
-
-Transformer, proposed in the paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762), is a novel network architecture for sequence-to-sequence (Seq2Seq) learning tasks such as machine translation (MT); it performs sequence-to-sequence modeling entirely with attention mechanisms [1].
-
-Compared with the recurrent neural networks (RNN) widely used in earlier Seq2Seq models, using (self-)attention to transform an input sequence into an output sequence has the following main advantages (a small numeric illustration of the first point follows this list):
-
-- Lower computational complexity
-  - For sequences with feature dimension d and length n, computation in an RNN costs O(n * d * d) (n time steps, each computing a d-dimensional matrix-vector product), while self-attention costs O(n * n * d) (all pairs of the n time steps compute a d-dimensional dot product or another relevance function); n is usually smaller than d.
-- Higher parallelism
-  - In an RNN the computation at the current time step depends on the result of the previous time step; in self-attention the computation at each time step depends only on the input, not on the outputs of earlier time steps, so all time steps can run fully in parallel.
-- Easier learning of long-range dependencies
-  - In an RNN, relating two positions that are n apart takes n steps; in self-attention any two positions are directly connected, and the shorter the path, the easier signals propagate.
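As a rough, purely illustrative check of the first bullet (d = 512 follows the base model's `d_model`; the sequence lengths n are arbitrary assumptions):

```python
# Per-layer cost in multiply-accumulates, constants ignored:
#   RNN:            n steps, each a d x d matrix-vector product -> n * d * d
#   self-attention: n * n pairwise d-dimensional dot products   -> n * n * d
d = 512
for n in (10, 50, 512):
    rnn, attn = n * d * d, n * n * d
    print(n, rnn, attn)  # attention is cheaper whenever n < d
```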
-
-The self-attention-based sequence modeling module introduced in Transformer has since been widely adopted in semantic representation models such as BERT [2], with remarkable results.
-
-
-### Model Overview
-
-Transformer uses the classic encoder-decoder framework of Seq2Seq models; the overall network architecture is shown in Figure 1.
-
-<p align="center">
-<img src="./images/transformer_network.png"/> <br />
-Figure 1. Transformer network architecture
-</p>
-
-Note that, unlike earlier Seq2Seq models, Transformer no longer uses RNN structures in its Encoder and Decoder.
-
-### Model Features
-
-The Encoder in Transformer is a stack of identical layers, each consisting of two sub-layers: multi-head attention and a fully connected feed-forward network.
-- Multi-Head Attention implements self-attention here. Compared with a plain attention mechanism, it applies several linear projections to the input, computes attention for each projection separately, concatenates all the results, and applies one more linear transform to produce the output. See Figure 2: the attention used is dot-product attention, scaled after the dot product to avoid pushing large values into the saturated region of the softmax (a sketch of this computation follows Figure 2 below).
-- The Feed-Forward network performs the same computation at every position of the sequence (position-wise), namely two linear transforms with a ReLU activation in between (a sketch follows this list).
-
-In addition, each sub-layer is followed by a residual connection [3] and layer normalization [4] to help gradient propagation and model convergence.
-
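A minimal numpy sketch of the position-wise feed-forward sub-layer together with the residual connection and layer normalization described above (shapes follow the base-model defaults d_model = 512 and d_inner_hid = 2048; this is a simplification under those assumptions, not a copy of the Paddle implementation):

```python
import numpy as np

def position_wise_ffn(x, w1, b1, w2, b2):
    # Two linear transforms with ReLU in between, applied at every position.
    return np.maximum(x @ w1 + b1, 0.) @ w2 + b2

def layer_norm(x, eps=1e-6):
    mu, var = x.mean(-1, keepdims=True), x.var(-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps)

n, d_model, d_inner = 6, 512, 2048  # sequence length 6 is arbitrary
x = np.random.randn(n, d_model)
w1, b1 = 0.02 * np.random.randn(d_model, d_inner), np.zeros(d_inner)
w2, b2 = 0.02 * np.random.randn(d_inner, d_model), np.zeros(d_model)
out = layer_norm(x + position_wise_ffn(x, w1, b1, w2, b2))  # residual + LN
print(out.shape)  # (6, 512)
```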
-<p align="center">
-<img src="./images/multi_head_attention.png"/> <br />
-Figure 2. Multi-Head Attention
-</p>
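A minimal numpy sketch of the scaled dot-product attention at the core of Figure 2, for a single head (d_k = 64 follows the base model's d_key; the shapes are illustrative assumptions):

```python
import numpy as np

def scaled_dot_product_attention(q, k, v):
    d_k = q.shape[-1]
    # Scale by 1/sqrt(d_k) so large dot products do not push the softmax
    # into its saturated region.
    scores = q @ k.T / np.sqrt(d_k)
    weights = np.exp(scores - scores.max(-1, keepdims=True))
    weights /= weights.sum(-1, keepdims=True)  # row-wise softmax
    return weights @ v

n, d_k = 5, 64  # 5 positions, head width 64
q, k, v = (np.random.randn(n, d_k) for _ in range(3))
print(scaled_dot_product_attention(q, k, v).shape)  # (5, 64)
```

Multi-head attention runs several such heads on differently projected inputs, concatenates their outputs, and applies one final linear projection.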
-
-The Decoder has a structure similar to the Encoder, except that each of its layers contains one extra multi-head attention sub-layer attending over the Encoder output. This encoder-decoder attention also exists in other Seq2Seq models.
-
-## FAQ
-
-**Q:** Why does the prediction output contain fewer samples than the input?
-**A:** If the longest sample exceeds the default `max_length` in `transformer.yaml`, increase `--max_length` at run time; otherwise over-long samples are filtered out.
-
-**Q:** What if the maximum length at prediction time exceeds the maximum length used in training?
-**A:** The `max_length` setting at training time determines the size of the saved model's position encoding. If lengths at prediction time exceed `max_length`, increase this value and a larger position encoding table will be regenerated (a sketch of this encoding follows this section).
-
-
-## References
-1. Vaswani A, Shazeer N, Parmar N, et al. [Attention is all you need](http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf)[C]//Advances in Neural Information Processing Systems. 2017: 6000-6010.
-2. Devlin J, Chang M W, Lee K, et al. [Bert: Pre-training of deep bidirectional transformers for language understanding](https://arxiv.org/abs/1810.04805)[J]. arXiv preprint arXiv:1810.04805, 2018.
-3. He K, Zhang X, Ren S, et al. [Deep residual learning for image recognition](http://openaccess.thecvf.com/content_cvpr_2016/papers/He_Deep_Residual_Learning_CVPR_2016_paper.pdf)[C]//Proceedings of the IEEE conference on computer vision and pattern recognition. 2016: 770-778.
-4. Ba J L, Kiros J R, Hinton G E. [Layer normalization](https://arxiv.org/pdf/1607.06450.pdf)[J]. arXiv preprint arXiv:1607.06450, 2016.
-5. Sennrich R, Haddow B, Birch A. [Neural machine translation of rare words with subword units](https://arxiv.org/pdf/1508.07909)[J]. arXiv preprint arXiv:1508.07909, 2015.
-
-
-## Changelog
-
-2019/08/16: standardized the code and updated the usage of Paddle APIs.
-
-## Author
-- [guochengCS](https://github.com/guoshengCS)
-
-## How to Contribute
-
-If you can fix an issue or add a new feature, feel free to submit a PR. If the PR is accepted, we score the contribution by quality and difficulty (0-5 points, higher is better). Once you accumulate 10 points, you may contact us for an interview opportunity or a recommendation letter.
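The second FAQ answer refers to regenerating the position encoding table. A minimal sketch of the sinusoidal position encoding; the project's real helper is `position_encoding_init` in transformer.py (its `(length, d_model)` call signature is visible in predict.py below, but the body here is a hedged re-implementation under that assumption, not the project's code):

```python
import numpy as np

def position_encoding_init(n_position, d_model):
    # pe[pos, 2i]   = sin(pos / 10000^(2i / d_model))
    # pe[pos, 2i+1] = cos(pos / 10000^(2i / d_model))
    pos = np.arange(n_position, dtype="float64")[:, None]
    i = np.arange(0, d_model, 2, dtype="float64")
    angle = pos / np.power(10000.0, i / d_model)
    pe = np.zeros((n_position, d_model))
    pe[:, 0::2] = np.sin(angle)
    pe[:, 1::2] = np.cos(angle)
    return pe.astype("float32")

# Allowing longer predictions only means building a bigger table:
print(position_encoding_init(256 + 1, 512).shape)  # (257, 512)
```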
diff --git a/PaddleNLP/neural_machine_translation/transformer/__init__.py b/PaddleNLP/neural_machine_translation/transformer/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PaddleNLP/neural_machine_translation/transformer/_ce.py b/PaddleNLP/neural_machine_translation/transformer/_ce.py
deleted file mode 100644
index c619b51377a446bed2872480d6e39dd6a61507ab..0000000000000000000000000000000000000000
--- a/PaddleNLP/neural_machine_translation/transformer/_ce.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#### This file is only used for continuous evaluation tests!
-
-import os
-import sys
-sys.path.insert(0, os.environ['ceroot'])
-from kpi import CostKpi, DurationKpi, AccKpi
-
-#### NOTE kpi.py should be shared in models in some way!!!!
-
-train_cost_card1_kpi = CostKpi('train_cost_card1', 0.002, 0, actived=True)
-# test_cost_card1_kpi = CostKpi('test_cost_card1', 0.008, 0, actived=True)
-train_duration_card1_kpi = DurationKpi(
-    'train_duration_card1', 0.006, 0, actived=True)
-train_cost_card4_kpi = CostKpi('train_cost_card4', 0.001, 0, actived=True)
-# test_cost_card4_kpi = CostKpi('test_cost_card4', 0.001, 0, actived=True)
-train_duration_card4_kpi = DurationKpi(
-    'train_duration_card4', 0.02, 0, actived=True)
-
-tracking_kpis = [
-    train_cost_card1_kpi,
-    # test_cost_card1_kpi,
-    train_duration_card1_kpi,
-    train_cost_card4_kpi,
-    # test_cost_card4_kpi,
-    train_duration_card4_kpi,
-]
-
-
-def parse_log(log):
-    '''
-    This method should be implemented by model developers.
-    The suggestion:
-    each line in the log should be key, value, for example:
-    "
-    train_cost\t1.0
-    test_cost\t1.0
-    train_cost\t1.0
-    train_cost\t1.0
-    train_acc\t1.2
-    "
-    '''
-    for line in log.split('\n'):
-        fs = line.strip().split('\t')
-        print(fs)
-        if len(fs) == 3 and fs[0] == 'kpis':
-            print("-----%s" % fs)
-            kpi_name = fs[1]
-            kpi_value = float(fs[2])
-            yield kpi_name, kpi_value
-
-
-def log_to_ce(log):
-    kpi_tracker = {}
-    for kpi in tracking_kpis:
-        kpi_tracker[kpi.name] = kpi
-
-    for (kpi_name, kpi_value) in parse_log(log):
-        print(kpi_name, kpi_value)
-        kpi_tracker[kpi_name].add_record(kpi_value)
-        kpi_tracker[kpi_name].persist()
-
-
-if __name__ == '__main__':
-    log = sys.stdin.read()
-    print("*****")
-    print(log)
-    print("****")
-    log_to_ce(log)
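Note that, despite the `key\tvalue` suggestion in its docstring, `parse_log` above only keeps three-field lines whose first field is `kpis`. A short standalone demonstration of the accepted format (the numeric values are made up):

```python
def parse_log(log):
    # Same filter as in _ce.py above: keep "kpis\t<name>\t<value>" lines only.
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        if len(fs) == 3 and fs[0] == 'kpis':
            yield fs[1], float(fs[2])

sample_log = "\n".join([
    "step_idx: 100, avg loss: 2.88",      # ignored: not a kpis line
    "kpis\ttrain_cost_card1\t2.880427",   # kept
    "kpis\ttrain_duration_card1\t29.93",  # kept
])
print(list(parse_log(sample_log)))
# [('train_cost_card1', 2.880427), ('train_duration_card1', 29.93)]
```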
diff --git a/PaddleNLP/neural_machine_translation/transformer/desc.py b/PaddleNLP/neural_machine_translation/transformer/desc.py
deleted file mode 100644
index d6c34191cd5f182b17eaabbce29c811985e97703..0000000000000000000000000000000000000000
--- a/PaddleNLP/neural_machine_translation/transformer/desc.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# The placeholder for batch_size in compile time. Must be -1 currently to be
-# consistent with some ops' infer-shape output in compile time, such as the
-# sequence_expand op used in beamsearch decoder.
-batch_size = None
-# The placeholder for sequence length in compile time.
-seq_len = None
-# The placeholder for head number in compile time.
-n_head = 8
-# The placeholder for model dim in compile time.
-d_model = 512
-# Here list the data shapes and data types of all inputs.
-# The shapes here act as placeholder and are set to pass the infer-shape in
-# compile time.
-input_descs = {
-    # The actual data shape of src_word is:
-    # [batch_size, max_src_len_in_batch]
-    "src_word": [(batch_size, seq_len), "int64", 2],
-    # The actual data shape of src_pos is:
-    # [batch_size, max_src_len_in_batch, 1]
-    "src_pos": [(batch_size, seq_len), "int64"],
-    # This input is used to remove attention weights on paddings in the
-    # encoder.
-    # The actual data shape of src_slf_attn_bias is:
-    # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
-    "src_slf_attn_bias": [(batch_size, n_head, seq_len, seq_len), "float32"],
-    # The actual data shape of trg_word is:
-    # [batch_size, max_trg_len_in_batch, 1]
-    "trg_word": [(batch_size, seq_len), "int64",
-                 2],  # lod_level is only used in fast decoder.
-    # The actual data shape of trg_pos is:
-    # [batch_size, max_trg_len_in_batch, 1]
-    "trg_pos": [(batch_size, seq_len), "int64"],
-    # This input is used to remove attention weights on paddings and
-    # subsequent words in the decoder.
-    # The actual data shape of trg_slf_attn_bias is:
-    # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
-    "trg_slf_attn_bias": [(batch_size, n_head, seq_len, seq_len), "float32"],
-    # This input is used to remove attention weights on paddings of the source
-    # input in the encoder-decoder attention.
-    # The actual data shape of trg_src_attn_bias is:
-    # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
-    "trg_src_attn_bias": [(batch_size, n_head, seq_len, seq_len), "float32"],
-    # This input is used in independent decoder program for inference.
-    # The actual data shape of enc_output is:
-    # [batch_size, max_src_len_in_batch, d_model]
-    "enc_output": [(batch_size, seq_len, d_model), "float32"],
-    # The actual data shape of label_word is:
-    # [batch_size * max_trg_len_in_batch, 1]
-    "lbl_word": [(None, 1), "int64"],
-    # This input is used to mask out the loss of padding tokens.
-    # The actual data shape of label_weight is:
-    # [batch_size * max_trg_len_in_batch, 1]
-    "lbl_weight": [(None, 1), "float32"],
-    # This input is used in beam-search decoder.
-    "init_score": [(batch_size, 1), "float32", 2],
-    # This input is used in beam-search decoder for the first gather
-    # (cell states update)
-    "init_idx": [(batch_size, ), "int32"],
-}
-
-# Names of word embedding table which might be reused for weight sharing.
-word_emb_param_names = (
-    "src_word_emb_table",
-    "trg_word_emb_table", )
-# Names of position encoding table which will be initialized externally.
-pos_enc_param_names = (
-    "src_pos_enc_table",
-    "trg_pos_enc_table", )
-# separated inputs for different usages.
-encoder_data_input_fields = (
-    "src_word",
-    "src_pos",
-    "src_slf_attn_bias", )
-decoder_data_input_fields = (
-    "trg_word",
-    "trg_pos",
-    "trg_slf_attn_bias",
-    "trg_src_attn_bias",
-    "enc_output", )
-label_data_input_fields = (
-    "lbl_word",
-    "lbl_weight", )
-# In fast decoder, trg_pos (only containing the current time step) is generated
-# by ops and trg_slf_attn_bias is not needed.
-fast_decoder_data_input_fields = (
-    "trg_word",
-    "init_score",
-    "init_idx",
-    "trg_src_attn_bias", )
diff --git a/PaddleNLP/neural_machine_translation/transformer/gen_data.sh b/PaddleNLP/neural_machine_translation/transformer/gen_data.sh
deleted file mode 100644
index e00ae05d9c5cc59b7b401428f6e1252397debfe9..0000000000000000000000000000000000000000
--- a/PaddleNLP/neural_machine_translation/transformer/gen_data.sh
+++ /dev/null
@@ -1,220 +0,0 @@
-#!
/usr/bin/env bash - -set -e - -OUTPUT_DIR=$PWD/gen_data - -############################################################################### -# change these variables for other WMT data -############################################################################### -OUTPUT_DIR_DATA="${OUTPUT_DIR}/wmt16_ende_data" -OUTPUT_DIR_BPE_DATA="${OUTPUT_DIR}/wmt16_ende_data_bpe" -LANG1="en" -LANG2="de" -# each of TRAIN_DATA: data_url data_file_lang1 data_file_lang2 -TRAIN_DATA=( -'http://www.statmt.org/europarl/v7/de-en.tgz' -'europarl-v7.de-en.en' 'europarl-v7.de-en.de' -'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz' -'commoncrawl.de-en.en' 'commoncrawl.de-en.de' -'http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz' -'news-commentary-v11.de-en.en' 'news-commentary-v11.de-en.de' -) -# each of DEV_TEST_DATA: data_url data_file_lang1 data_file_lang2 -DEV_TEST_DATA=( -'http://data.statmt.org/wmt16/translation-task/dev.tgz' -'newstest201[45]-deen-ref.en.sgm' 'newstest201[45]-deen-src.de.sgm' -'http://data.statmt.org/wmt16/translation-task/test.tgz' -'newstest2016-deen-ref.en.sgm' 'newstest2016-deen-src.de.sgm' -) -############################################################################### - -############################################################################### -# change these variables for other WMT data -############################################################################### -# OUTPUT_DIR_DATA="${OUTPUT_DIR}/wmt14_enfr_data" -# OUTPUT_DIR_BPE_DATA="${OUTPUT_DIR}/wmt14_enfr_data_bpe" -# LANG1="en" -# LANG2="fr" -# # each of TRAIN_DATA: ata_url data_tgz data_file -# TRAIN_DATA=( -# 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz' -# 'commoncrawl.fr-en.en' 'commoncrawl.fr-en.fr' -# 'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz' -# 'training/europarl-v7.fr-en.en' 'training/europarl-v7.fr-en.fr' -# 'http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz' -# 'training/news-commentary-v9.fr-en.en' 'training/news-commentary-v9.fr-en.fr' -# 'http://www.statmt.org/wmt10/training-giga-fren.tar' -# 'giga-fren.release2.fixed.en.*' 'giga-fren.release2.fixed.fr.*' -# 'http://www.statmt.org/wmt13/training-parallel-un.tgz' -# 'un/undoc.2000.fr-en.en' 'un/undoc.2000.fr-en.fr' -# ) -# # each of DEV_TEST_DATA: data_url data_tgz data_file_lang1 data_file_lang2 -# DEV_TEST_DATA=( -# 'http://data.statmt.org/wmt16/translation-task/dev.tgz' -# '.*/newstest201[45]-fren-ref.en.sgm' '.*/newstest201[45]-fren-src.fr.sgm' -# 'http://data.statmt.org/wmt16/translation-task/test.tgz' -# '.*/newstest2016-fren-ref.en.sgm' '.*/newstest2016-fren-src.fr.sgm' -# ) -############################################################################### - -mkdir -p $OUTPUT_DIR_DATA $OUTPUT_DIR_BPE_DATA - -# Extract training data -for ((i=0;i<${#TRAIN_DATA[@]};i+=3)); do - data_url=${TRAIN_DATA[i]} - data_tgz=${data_url##*/} # training-parallel-commoncrawl.tgz - data=${data_tgz%.*} # training-parallel-commoncrawl - data_lang1=${TRAIN_DATA[i+1]} - data_lang2=${TRAIN_DATA[i+2]} - if [ ! -e ${OUTPUT_DIR_DATA}/${data_tgz} ]; then - echo "Download "${data_url} - wget -O ${OUTPUT_DIR_DATA}/${data_tgz} ${data_url} - fi - - if [ ! 
-d ${OUTPUT_DIR_DATA}/${data} ]; then - echo "Extract "${data_tgz} - mkdir -p ${OUTPUT_DIR_DATA}/${data} - tar_type=${data_tgz:0-3} - if [ ${tar_type} == "tar" ]; then - tar -xvf ${OUTPUT_DIR_DATA}/${data_tgz} -C ${OUTPUT_DIR_DATA}/${data} - else - tar -xvzf ${OUTPUT_DIR_DATA}/${data_tgz} -C ${OUTPUT_DIR_DATA}/${data} - fi - fi - # concatenate all training data - for data_lang in $data_lang1 $data_lang2; do - for f in `find ${OUTPUT_DIR_DATA}/${data} -regex ".*/${data_lang}"`; do - data_dir=`dirname $f` - data_file=`basename $f` - f_base=${f%.*} - f_ext=${f##*.} - if [ $f_ext == "gz" ]; then - gunzip $f - l=${f_base##*.} - f_base=${f_base%.*} - else - l=${f_ext} - fi - - if [ $i -eq 0 ]; then - cat ${f_base}.$l > ${OUTPUT_DIR_DATA}/train.$l - else - cat ${f_base}.$l >> ${OUTPUT_DIR_DATA}/train.$l - fi - done - done -done - -# Clone mosesdecoder -if [ ! -d ${OUTPUT_DIR}/mosesdecoder ]; then - echo "Cloning moses for data processing" - git clone https://github.com/moses-smt/mosesdecoder.git ${OUTPUT_DIR}/mosesdecoder -fi - -# Extract develop and test data -dev_test_data="" -for ((i=0;i<${#DEV_TEST_DATA[@]};i+=3)); do - data_url=${DEV_TEST_DATA[i]} - data_tgz=${data_url##*/} # training-parallel-commoncrawl.tgz - data=${data_tgz%.*} # training-parallel-commoncrawl - data_lang1=${DEV_TEST_DATA[i+1]} - data_lang2=${DEV_TEST_DATA[i+2]} - if [ ! -e ${OUTPUT_DIR_DATA}/${data_tgz} ]; then - echo "Download "${data_url} - wget -O ${OUTPUT_DIR_DATA}/${data_tgz} ${data_url} - fi - - if [ ! -d ${OUTPUT_DIR_DATA}/${data} ]; then - echo "Extract "${data_tgz} - mkdir -p ${OUTPUT_DIR_DATA}/${data} - tar_type=${data_tgz:0-3} - if [ ${tar_type} == "tar" ]; then - tar -xvf ${OUTPUT_DIR_DATA}/${data_tgz} -C ${OUTPUT_DIR_DATA}/${data} - else - tar -xvzf ${OUTPUT_DIR_DATA}/${data_tgz} -C ${OUTPUT_DIR_DATA}/${data} - fi - fi - - for data_lang in $data_lang1 $data_lang2; do - for f in `find ${OUTPUT_DIR_DATA}/${data} -regex ".*/${data_lang}"`; do - data_dir=`dirname $f` - data_file=`basename $f` - data_out=`echo ${data_file} | cut -d '-' -f 1` # newstest2016 - l=`echo ${data_file} | cut -d '.' -f 2` # en - dev_test_data="${dev_test_data}\|${data_out}" # to make regexp - if [ ! -e ${OUTPUT_DIR_DATA}/${data_out}.$l ]; then - ${OUTPUT_DIR}/mosesdecoder/scripts/ems/support/input-from-sgm.perl \ - < $f > ${OUTPUT_DIR_DATA}/${data_out}.$l - fi - done - done -done - -# Tokenize data -for l in ${LANG1} ${LANG2}; do - for f in `ls ${OUTPUT_DIR_DATA}/*.$l | grep "\(train${dev_test_data}\)\.$l$"`; do - f_base=${f%.*} # dir/train dir/newstest2016 - f_out=$f_base.tok.$l - if [ ! -e $f_out ]; then - echo "Tokenize "$f - ${OUTPUT_DIR}/mosesdecoder/scripts/tokenizer/tokenizer.perl -q -l $l -threads 8 < $f > $f_out - fi - done -done - -# Clean data -for f in ${OUTPUT_DIR_DATA}/train.${LANG1} ${OUTPUT_DIR_DATA}/train.tok.${LANG1}; do - f_base=${f%.*} # dir/train dir/train.tok - f_out=${f_base}.clean - if [ ! -e $f_out.${LANG1} ] && [ ! -e $f_out.${LANG2} ]; then - echo "Clean "${f_base} - ${OUTPUT_DIR}/mosesdecoder/scripts/training/clean-corpus-n.perl $f_base ${LANG1} ${LANG2} ${f_out} 1 80 - fi -done - -# Clone subword-nmt and generate BPE data -if [ ! -d ${OUTPUT_DIR}/subword-nmt ]; then - git clone https://github.com/rsennrich/subword-nmt.git ${OUTPUT_DIR}/subword-nmt -fi - -# Generate BPE data and vocabulary -for num_operations in 32000; do - if [ ! 
-e ${OUTPUT_DIR_BPE_DATA}/bpe.${num_operations} ]; then
-    echo "Learn BPE with ${num_operations} merge operations"
-    cat ${OUTPUT_DIR_DATA}/train.tok.clean.${LANG1} ${OUTPUT_DIR_DATA}/train.tok.clean.${LANG2} | \
-      ${OUTPUT_DIR}/subword-nmt/learn_bpe.py -s $num_operations > ${OUTPUT_DIR_BPE_DATA}/bpe.${num_operations}
-  fi
-
-  for l in ${LANG1} ${LANG2}; do
-    for f in `ls ${OUTPUT_DIR_DATA}/*.$l | grep "\(train${dev_test_data}\)\.tok\(\.clean\)\?\.$l$"`; do
-      f_base=${f%.*} # dir/train.tok dir/train.tok.clean dir/newstest2016.tok
-      f_base=${f_base##*/} # train.tok train.tok.clean newstest2016.tok
-      f_out=${OUTPUT_DIR_BPE_DATA}/${f_base}.bpe.${num_operations}.$l
-      if [ ! -e $f_out ]; then
-        echo "Apply BPE to "$f
-        ${OUTPUT_DIR}/subword-nmt/apply_bpe.py -c ${OUTPUT_DIR_BPE_DATA}/bpe.${num_operations} < $f > $f_out
-      fi
-    done
-  done
-
-  if [ ! -e ${OUTPUT_DIR_BPE_DATA}/vocab.bpe.${num_operations} ]; then
-    echo "Create vocabulary for BPE data"
-    cat ${OUTPUT_DIR_BPE_DATA}/train.tok.clean.bpe.${num_operations}.${LANG1} ${OUTPUT_DIR_BPE_DATA}/train.tok.clean.bpe.${num_operations}.${LANG2} | \
-      ${OUTPUT_DIR}/subword-nmt/get_vocab.py | cut -f1 -d ' ' > ${OUTPUT_DIR_BPE_DATA}/vocab.bpe.${num_operations}
-  fi
-done
-
-# Adapt to the reader
-for f in ${OUTPUT_DIR_BPE_DATA}/*.bpe.${num_operations}.${LANG1}; do
-  f_base=${f%.*} # dir/train.tok.clean.bpe.32000 dir/newstest2016.tok.bpe.32000
-  f_out=${f_base}.${LANG1}-${LANG2}
-  if [ ! -e $f_out ]; then
-    paste -d '\t' $f_base.${LANG1} $f_base.${LANG2} > $f_out
-  fi
-done
-if [ ! -e ${OUTPUT_DIR_BPE_DATA}/vocab_all.bpe.${num_operations} ]; then
-  sed '1i\<s>\n<e>\n<unk>' ${OUTPUT_DIR_BPE_DATA}/vocab.bpe.${num_operations} > ${OUTPUT_DIR_BPE_DATA}/vocab_all.bpe.${num_operations}
-fi
-
-echo "All done."
diff --git a/PaddleNLP/neural_machine_translation/transformer/images/multi_head_attention.png b/PaddleNLP/neural_machine_translation/transformer/images/multi_head_attention.png
deleted file mode 100644
index 427fb6b32aaeb7013066a167aab4fb97c024c2d6..0000000000000000000000000000000000000000
Binary files a/PaddleNLP/neural_machine_translation/transformer/images/multi_head_attention.png and /dev/null differ
diff --git a/PaddleNLP/neural_machine_translation/transformer/images/transformer_network.png b/PaddleNLP/neural_machine_translation/transformer/images/transformer_network.png
deleted file mode 100644
index 34be0e5c7e2b08f858683d86353db5e81049c7ca..0000000000000000000000000000000000000000
Binary files a/PaddleNLP/neural_machine_translation/transformer/images/transformer_network.png and /dev/null differ
diff --git a/PaddleNLP/neural_machine_translation/transformer/inference_model.py b/PaddleNLP/neural_machine_translation/transformer/inference_model.py
deleted file mode 100644
index d1b88f5be965f85eac1bc703ee10f7cca57bcb78..0000000000000000000000000000000000000000
--- a/PaddleNLP/neural_machine_translation/transformer/inference_model.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import six -import sys -import time - -import numpy as np -import paddle -import paddle.fluid as fluid - -#include palm for easier nlp coding -from palm.toolkit.input_field import InputField -from palm.toolkit.configure import PDConfig - -# include task-specific libs -import desc -import reader -from transformer import create_net - - -def init_from_pretrain_model(args, exe, program): - - assert isinstance(args.init_from_pretrain_model, str) - - if not os.path.exists(args.init_from_pretrain_model): - raise Warning("The pretrained params do not exist.") - return False - - def existed_params(var): - if not isinstance(var, fluid.framework.Parameter): - return False - return os.path.exists( - os.path.join(args.init_from_pretrain_model, var.name)) - - fluid.io.load_vars( - exe, - args.init_from_pretrain_model, - main_program=program, - predicate=existed_params) - - print("finish initing model from pretrained params from %s" % - (args.init_from_pretrain_model)) - - return True - - -def init_from_params(args, exe, program): - - assert isinstance(args.init_from_params, str) - - if not os.path.exists(args.init_from_params): - raise Warning("the params path does not exist.") - return False - - fluid.io.load_params( - executor=exe, - dirname=args.init_from_params, - main_program=program, - filename="params.pdparams") - - print("finish init model from params from %s" % (args.init_from_params)) - - return True - - -def do_save_inference_model(args): - if args.use_cuda: - dev_count = fluid.core.get_cuda_device_count() - place = fluid.CUDAPlace(0) - else: - dev_count = int(os.environ.get('CPU_NUM', 1)) - place = fluid.CPUPlace() - - test_prog = fluid.default_main_program() - startup_prog = fluid.default_startup_program() - - with fluid.program_guard(test_prog, startup_prog): - with fluid.unique_name.guard(): - - # define input and reader - - input_field_names = desc.encoder_data_input_fields + desc.fast_decoder_data_input_fields - input_slots = [{ - "name": name, - "shape": desc.input_descs[name][0], - "dtype": desc.input_descs[name][1] - } for name in input_field_names] - - input_field = InputField(input_slots) - input_field.build(build_pyreader=True) - - # define the network - - predictions = create_net( - is_training=False, model_input=input_field, args=args) - out_ids, out_scores = predictions - - # This is used here to set dropout to the test mode. 
- test_prog = test_prog.clone(for_test=True) - - # prepare predicting - - ## define the executor and program for training - - exe = fluid.Executor(place) - - exe.run(startup_prog) - assert (args.init_from_params) or (args.init_from_pretrain_model) - - if args.init_from_params: - init_from_params(args, exe, test_prog) - - elif args.init_from_pretrain_model: - init_from_pretrain_model(args, exe, test_prog) - - # saving inference model - - fluid.io.save_inference_model( - args.inference_model_dir, - feeded_var_names=input_field_names, - target_vars=[out_ids, out_scores], - executor=exe, - main_program=test_prog, - model_filename="model.pdmodel", - params_filename="params.pdparams") - - print("save inference model at %s" % (args.inference_model_dir)) - - -if __name__ == "__main__": - args = PDConfig(yaml_file="./transformer.yaml") - args.build() - args.Print() - - do_save_inference_model(args) diff --git a/PaddleNLP/neural_machine_translation/transformer/main.py b/PaddleNLP/neural_machine_translation/transformer/main.py deleted file mode 100644 index 6ff929af0e72296bc635a56d90d2c0925b5bad68..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/main.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import logging - -import numpy as np -import paddle -import paddle.fluid as fluid - -#include palm for easier nlp coding -from palm.toolkit.configure import PDConfig - -from train import do_train -from predict import do_predict -from inference_model import do_save_inference_model - -if __name__ == "__main__": - LOG_FORMAT = "[%(asctime)s %(levelname)s %(filename)s:%(lineno)d] %(message)s" - logging.basicConfig( - stream=sys.stdout, level=logging.DEBUG, format=LOG_FORMAT) - logging.getLogger().setLevel(logging.INFO) - - args = PDConfig(yaml_file="./transformer.yaml") - args.build() - args.Print() - - if args.do_train: - do_train(args) - - if args.do_predict: - do_predict(args) - - if args.do_save_inference_model: - do_save_inference_model(args) \ No newline at end of file diff --git a/PaddleNLP/neural_machine_translation/transformer/predict.py b/PaddleNLP/neural_machine_translation/transformer/predict.py deleted file mode 100644 index 7ad847fd313ae688e04cea4373912280e220358a..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/predict.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import six
-import sys
-import time
-
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-
-from utils.input_field import InputField
-from utils.configure import PDConfig
-from utils.check import check_gpu, check_version
-
-# include task-specific libs
-import desc
-import reader
-from transformer import create_net, position_encoding_init
-
-
-def init_from_pretrain_model(args, exe, program):
-
-    assert isinstance(args.init_from_pretrain_model, str)
-
-    if not os.path.exists(args.init_from_pretrain_model):
-        raise Warning("The pretrained params do not exist.")
-        return False
-
-    def existed_params(var):
-        if not isinstance(var, fluid.framework.Parameter):
-            return False
-        return os.path.exists(
-            os.path.join(args.init_from_pretrain_model, var.name))
-
-    fluid.io.load_vars(
-        exe,
-        args.init_from_pretrain_model,
-        main_program=program,
-        predicate=existed_params)
-
-    print("finish initing model from pretrained params from %s" %
-          (args.init_from_pretrain_model))
-
-    return True
-
-
-def init_from_params(args, exe, program):
-
-    assert isinstance(args.init_from_params, str)
-
-    if not os.path.exists(args.init_from_params):
-        raise Warning("the params path does not exist.")
-        return False
-
-    fluid.io.load_params(
-        executor=exe,
-        dirname=args.init_from_params,
-        main_program=program,
-        filename="params.pdparams")
-
-    print("finish init model from params from %s" % (args.init_from_params))
-
-    return True
-
-
-def post_process_seq(seq, bos_idx, eos_idx, output_bos=False, output_eos=False):
-    """
-    Post-process the beam-search decoded sequence. Truncate from the first
-    <eos> and remove the <bos> and <eos> tokens currently.
- """ - eos_pos = len(seq) - 1 - for i, idx in enumerate(seq): - if idx == eos_idx: - eos_pos = i - break - seq = [ - idx for idx in seq[:eos_pos + 1] - if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx) - ] - return seq - - -def do_predict(args): - if args.use_cuda: - dev_count = fluid.core.get_cuda_device_count() - place = fluid.CUDAPlace(0) - else: - dev_count = int(os.environ.get('CPU_NUM', 1)) - place = fluid.CPUPlace() - # define the data generator - processor = reader.DataProcessor( - fpattern=args.predict_file, - src_vocab_fpath=args.src_vocab_fpath, - trg_vocab_fpath=args.trg_vocab_fpath, - token_delimiter=args.token_delimiter, - use_token_batch=False, - batch_size=args.batch_size, - device_count=dev_count, - pool_size=args.pool_size, - sort_type=reader.SortType.NONE, - shuffle=False, - shuffle_batch=False, - start_mark=args.special_token[0], - end_mark=args.special_token[1], - unk_mark=args.special_token[2], - max_length=args.max_length, - n_head=args.n_head) - batch_generator = processor.data_generator(phase="predict", place=place) - args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \ - args.unk_idx = processor.get_vocab_summary() - trg_idx2word = reader.DataProcessor.load_dict( - dict_path=args.trg_vocab_fpath, reverse=True) - - test_prog = fluid.default_main_program() - startup_prog = fluid.default_startup_program() - - with fluid.program_guard(test_prog, startup_prog): - with fluid.unique_name.guard(): - - # define input and reader - - input_field_names = desc.encoder_data_input_fields + desc.fast_decoder_data_input_fields - input_slots = [{ - "name": name, - "shape": desc.input_descs[name][0], - "dtype": desc.input_descs[name][1] - } for name in input_field_names] - - input_field = InputField(input_slots) - input_field.build(build_pyreader=True) - - # define the network - - predictions = create_net( - is_training=False, model_input=input_field, args=args) - out_ids, out_scores = predictions - - # This is used here to set dropout to the test mode. - test_prog = test_prog.clone(for_test=True) - - # prepare predicting - - ## define the executor and program for training - - exe = fluid.Executor(place) - - exe.run(startup_prog) - assert (args.init_from_params) or (args.init_from_pretrain_model) - - if args.init_from_params: - init_from_params(args, exe, test_prog) - - elif args.init_from_pretrain_model: - init_from_pretrain_model(args, exe, test_prog) - - # to avoid a longer length than training, reset the size of position encoding to max_length - for pos_enc_param_name in desc.pos_enc_param_names: - pos_enc_param = fluid.global_scope().find_var( - pos_enc_param_name).get_tensor() - - pos_enc_param.set( - position_encoding_init(args.max_length + 1, args.d_model), place) - - exe_strategy = fluid.ExecutionStrategy() - # to clear tensor array after each iteration - exe_strategy.num_iteration_per_drop_scope = 1 - compiled_test_prog = fluid.CompiledProgram(test_prog).with_data_parallel( - exec_strategy=exe_strategy, places=place) - - f = open(args.output_file, "wb") - # start predicting - ## decorate the pyreader with batch_generator - input_field.loader.set_batch_generator(batch_generator) - input_field.loader.start() - while True: - try: - seq_ids, seq_scores = exe.run( - compiled_test_prog, - fetch_list=[out_ids.name, out_scores.name], - return_numpy=False) - - # How to parse the results: - # Suppose the lod of seq_ids is: - # [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]] - # then from lod[0]: - # there are 2 source sentences, beam width is 3. 
- # from lod[1]: - # the first source sentence has 3 hyps; the lengths are 12, 12, 16 - # the second source sentence has 3 hyps; the lengths are 14, 13, 15 - hyps = [[] for i in range(len(seq_ids.lod()[0]) - 1)] - scores = [[] for i in range(len(seq_scores.lod()[0]) - 1)] - for i in range(len(seq_ids.lod()[0]) - - 1): # for each source sentence - start = seq_ids.lod()[0][i] - end = seq_ids.lod()[0][i + 1] - for j in range(end - start): # for each candidate - sub_start = seq_ids.lod()[1][start + j] - sub_end = seq_ids.lod()[1][start + j + 1] - hyps[i].append(b" ".join([ - trg_idx2word[idx] - for idx in post_process_seq( - np.array(seq_ids)[sub_start:sub_end], args.bos_idx, - args.eos_idx) - ])) - scores[i].append(np.array(seq_scores)[sub_end - 1]) - f.write(hyps[i][-1] + b"\n") - if len(hyps[i]) >= args.n_best: - break - except fluid.core.EOFException: - break - - f.close() - - -if __name__ == "__main__": - args = PDConfig(yaml_file="./transformer.yaml") - args.build() - args.Print() - check_gpu(args.use_cuda) - check_version() - - do_predict(args) diff --git a/PaddleNLP/neural_machine_translation/transformer/reader.py b/PaddleNLP/neural_machine_translation/transformer/reader.py deleted file mode 100644 index e69b4a252be9f7a2d684286cd6fb9009128a1c3f..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/reader.py +++ /dev/null @@ -1,567 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import glob -import six -import os -import tarfile - -import numpy as np -import paddle.fluid as fluid - - -def pad_batch_data(insts, - pad_idx, - n_head, - is_target=False, - is_label=False, - return_attn_bias=True, - return_max_len=True, - return_num_token=False): - """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and attention bias. - """ - return_list = [] - max_len = max(len(inst) for inst in insts) - # Any token included in dict can be used to pad, since the paddings' loss - # will be masked out by weights and make no effect on parameter gradients. - inst_data = np.array( - [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) - return_list += [inst_data.astype("int64").reshape([-1, 1])] - if is_label: # label weight - inst_weight = np.array([[1.] * len(inst) + [0.] * (max_len - len(inst)) - for inst in insts]) - return_list += [inst_weight.astype("float32").reshape([-1, 1])] - else: # position data - inst_pos = np.array([ - list(range(0, len(inst))) + [0] * (max_len - len(inst)) - for inst in insts - ]) - return_list += [inst_pos.astype("int64").reshape([-1, 1])] - if return_attn_bias: - if is_target: - # This is used to avoid attention on paddings and subsequent - # words. 
- slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, max_len)) - slf_attn_bias_data = np.triu(slf_attn_bias_data, - 1).reshape([-1, 1, max_len, max_len]) - slf_attn_bias_data = np.tile(slf_attn_bias_data, - [1, n_head, 1, 1]) * [-1e9] - else: - # This is used to avoid attention on paddings. - slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] * - (max_len - len(inst)) - for inst in insts]) - slf_attn_bias_data = np.tile( - slf_attn_bias_data.reshape([-1, 1, 1, max_len]), - [1, n_head, max_len, 1]) - return_list += [slf_attn_bias_data.astype("float32")] - if return_max_len: - return_list += [max_len] - if return_num_token: - num_token = 0 - for inst in insts: - num_token += len(inst) - return_list += [num_token] - return return_list if len(return_list) > 1 else return_list[0] - - -def prepare_train_input(insts, src_pad_idx, trg_pad_idx, n_head): - """ - Put all padded data needed by training into a list. - """ - src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data( - [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False) - src_word = src_word.reshape(-1, src_max_len) - src_pos = src_pos.reshape(-1, src_max_len) - trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = pad_batch_data( - [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True) - trg_word = trg_word.reshape(-1, trg_max_len) - trg_pos = trg_pos.reshape(-1, trg_max_len) - - trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], - [1, 1, trg_max_len, 1]).astype("float32") - - lbl_word, lbl_weight, num_token = pad_batch_data( - [inst[2] for inst in insts], - trg_pad_idx, - n_head, - is_target=False, - is_label=True, - return_attn_bias=False, - return_max_len=False, - return_num_token=True) - lbl_word = lbl_word.reshape(-1, 1) - lbl_weight = lbl_weight.reshape(-1, 1) - - data_inputs = [ - src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, - trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight - ] - - return data_inputs - - -def prepare_infer_input(insts, src_pad_idx, bos_idx, n_head, place): - """ - Put all padded data needed by beam search decoder into a list. 
- """ - src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data( - [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False) - # start tokens - trg_word = np.asarray([[bos_idx]] * len(insts), dtype="int64") - trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], - [1, 1, 1, 1]).astype("float32") - trg_word = trg_word.reshape(-1, 1) - src_word = src_word.reshape(-1, src_max_len) - src_pos = src_pos.reshape(-1, src_max_len) - - def to_lodtensor(data, place, lod=None): - data_tensor = fluid.LoDTensor() - data_tensor.set(data, place) - if lod is not None: - data_tensor.set_lod(lod) - return data_tensor - - # beamsearch_op must use tensors with lod - init_score = to_lodtensor( - np.zeros_like( - trg_word, dtype="float32").reshape(-1, 1), - place, [range(trg_word.shape[0] + 1)] * 2) - trg_word = to_lodtensor(trg_word, place, [range(trg_word.shape[0] + 1)] * 2) - init_idx = np.asarray(range(len(insts)), dtype="int32") - - data_inputs = [ - src_word, src_pos, src_slf_attn_bias, trg_word, init_score, init_idx, - trg_src_attn_bias - ] - return data_inputs - - -class SortType(object): - GLOBAL = 'global' - POOL = 'pool' - NONE = "none" - - -class Converter(object): - def __init__(self, vocab, beg, end, unk, delimiter, add_beg): - self._vocab = vocab - self._beg = beg - self._end = end - self._unk = unk - self._delimiter = delimiter - self._add_beg = add_beg - - def __call__(self, sentence): - return ([self._beg] if self._add_beg else []) + [ - self._vocab.get(w, self._unk) - for w in sentence.split(self._delimiter) - ] + [self._end] - - -class ComposedConverter(object): - def __init__(self, converters): - self._converters = converters - - def __call__(self, parallel_sentence): - return [ - self._converters[i](parallel_sentence[i]) - for i in range(len(self._converters)) - ] - - -class SentenceBatchCreator(object): - def __init__(self, batch_size): - self.batch = [] - self._batch_size = batch_size - - def append(self, info): - self.batch.append(info) - if len(self.batch) == self._batch_size: - tmp = self.batch - self.batch = [] - return tmp - - -class TokenBatchCreator(object): - def __init__(self, batch_size): - self.batch = [] - self.max_len = -1 - self._batch_size = batch_size - - def append(self, info): - cur_len = info.max_len - max_len = max(self.max_len, cur_len) - if max_len * (len(self.batch) + 1) > self._batch_size: - result = self.batch - self.batch = [info] - self.max_len = cur_len - return result - else: - self.max_len = max_len - self.batch.append(info) - - -class SampleInfo(object): - def __init__(self, i, max_len, min_len): - self.i = i - self.min_len = min_len - self.max_len = max_len - - -class MinMaxFilter(object): - def __init__(self, max_len, min_len, underlying_creator): - self._min_len = min_len - self._max_len = max_len - self._creator = underlying_creator - - def append(self, info): - if info.max_len > self._max_len or info.min_len < self._min_len: - return - else: - return self._creator.append(info) - - @property - def batch(self): - return self._creator.batch - - -class DataProcessor(object): - """ - The data reader loads all data from files and produces batches of data - in the way corresponding to settings. 
-
-    An example of returning a generator producing data batches whose data
-    is shuffled in each pass and sorted in each pool:
-
-    ```
-    train_data = DataProcessor(
-        src_vocab_fpath='data/src_vocab_file',
-        trg_vocab_fpath='data/trg_vocab_file',
-        fpattern='data/part-*',
-        use_token_batch=True,
-        batch_size=2000,
-        device_count=8,
-        n_head=8,
-        pool_size=10000,
-        sort_type=SortType.POOL,
-        shuffle=True,
-        shuffle_batch=True,
-        start_mark='<s>',
-        end_mark='<e>',
-        unk_mark='<unk>',
-        clip_last_batch=False).data_generator(phase='train')
-    ```
-
-    :param src_vocab_fpath: The path of vocabulary file of source language.
-    :type src_vocab_fpath: basestring
-    :param trg_vocab_fpath: The path of vocabulary file of target language.
-    :type trg_vocab_fpath: basestring
-    :param fpattern: The pattern to match data files.
-    :type fpattern: basestring
-    :param batch_size: The number of sequences contained in a mini-batch,
-        or the maximum number of tokens (include paddings) contained in a
-        mini-batch.
-    :type batch_size: int
-    :param pool_size: The size of pool buffer.
-    :type pool_size: int
-    :param device_count: The number of devices. The actual batch size is
-        determined by both batch_size and device_count.
-    :type device_count: int
-    :param n_head: The number of head used in multi-head attention. Actually,
-        this is not a reader related argument, but is used for input data.
-    :type n_head: int
-    :param sort_type: The grain to sort by length: 'global' for all
-        instances; 'pool' for instances in pool; 'none' for no sort.
-    :type sort_type: basestring
-    :param clip_last_batch: Whether to clip the last uncompleted batch.
-    :type clip_last_batch: bool
-    :param tar_fname: The data file in tar if fpattern matches a tar file.
-    :type tar_fname: basestring
-    :param min_length: The minimum length used to filter sequences.
-    :type min_length: int
-    :param max_length: The maximum length used to filter sequences.
-    :type max_length: int
-    :param shuffle: Whether to shuffle all instances.
-    :type shuffle: bool
-    :param shuffle_batch: Whether to shuffle the generated batches.
-    :type shuffle_batch: bool
-    :param use_token_batch: Whether to produce batch data according to
-        token number.
-    :type use_token_batch: bool
-    :param field_delimiter: The delimiter used to split source and target in
-        each line of data file.
-    :type field_delimiter: basestring
-    :param token_delimiter: The delimiter used to split tokens in source or
-        target sentences.
-    :type token_delimiter: basestring
-    :param start_mark: The token representing the beginning of
-        sentences in dictionary.
-    :type start_mark: basestring
-    :param end_mark: The token representing the end of sentences
-        in dictionary.
-    :type end_mark: basestring
-    :param unk_mark: The token representing unknown word in dictionary.
-    :type unk_mark: basestring
-    :param only_src: Whether each line is a source and target sentence
-        pair or only has the source sentence.
-    :type only_src: bool
-    :param seed: The seed for random.
- :type seed: int - """ - - def __init__(self, - src_vocab_fpath, - trg_vocab_fpath, - fpattern, - batch_size, - device_count, - n_head, - pool_size, - sort_type=SortType.GLOBAL, - clip_last_batch=False, - tar_fname=None, - min_length=0, - max_length=100, - shuffle=True, - shuffle_batch=False, - use_token_batch=False, - field_delimiter="\t", - token_delimiter=" ", - start_mark="", - end_mark="", - unk_mark="", - only_src=False, - seed=0): - # convert str to bytes, and use byte data - field_delimiter = field_delimiter.encode("utf8") - token_delimiter = token_delimiter.encode("utf8") - start_mark = start_mark.encode("utf8") - end_mark = end_mark.encode("utf8") - unk_mark = unk_mark.encode("utf8") - self._src_vocab = self.load_dict(src_vocab_fpath) - self._trg_vocab = self.load_dict(trg_vocab_fpath) - self._bos_idx = self._src_vocab[start_mark] - self._eos_idx = self._src_vocab[end_mark] - self._unk_idx = self._src_vocab[unk_mark] - self._only_src = only_src - self._pool_size = pool_size - self._batch_size = batch_size - self._device_count = device_count - self._n_head = n_head - self._use_token_batch = use_token_batch - self._sort_type = sort_type - self._clip_last_batch = clip_last_batch - self._shuffle = shuffle - self._shuffle_batch = shuffle_batch - self._min_length = min_length - self._max_length = max_length - self._field_delimiter = field_delimiter - self._token_delimiter = token_delimiter - self.load_src_trg_ids(fpattern, tar_fname) - self._random = np.random - self._random.seed(seed) - - def load_src_trg_ids(self, fpattern, tar_fname): - converters = [ - Converter( - vocab=self._src_vocab, - beg=self._bos_idx, - end=self._eos_idx, - unk=self._unk_idx, - delimiter=self._token_delimiter, - add_beg=False) - ] - if not self._only_src: - converters.append( - Converter( - vocab=self._trg_vocab, - beg=self._bos_idx, - end=self._eos_idx, - unk=self._unk_idx, - delimiter=self._token_delimiter, - add_beg=True)) - - converters = ComposedConverter(converters) - - self._src_seq_ids = [] - self._trg_seq_ids = None if self._only_src else [] - self._sample_infos = [] - - for i, line in enumerate(self._load_lines(fpattern, tar_fname)): - src_trg_ids = converters(line) - self._src_seq_ids.append(src_trg_ids[0]) - lens = [len(src_trg_ids[0])] - if not self._only_src: - self._trg_seq_ids.append(src_trg_ids[1]) - lens.append(len(src_trg_ids[1])) - self._sample_infos.append(SampleInfo(i, max(lens), min(lens))) - - def _load_lines(self, fpattern, tar_fname): - fpaths = glob.glob(fpattern) - assert len(fpaths) > 0, "no matching file to the provided data path" - - if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]): - if tar_fname is None: - raise Exception("If tar file provided, please set tar_fname.") - - f = tarfile.open(fpaths[0], "rb") - for line in f.extractfile(tar_fname): - fields = line.strip(b"\n").split(self._field_delimiter) - if (not self._only_src and len(fields) == 2) or ( - self._only_src and len(fields) == 1): - yield fields - else: - for fpath in fpaths: - if not os.path.isfile(fpath): - raise IOError("Invalid file: %s" % fpath) - - with open(fpath, "rb") as f: - for line in f: - fields = line.strip(b"\n").split(self._field_delimiter) - if (not self._only_src and len(fields) == 2) or ( - self._only_src and len(fields) == 1): - yield fields - - @staticmethod - def load_dict(dict_path, reverse=False): - word_dict = {} - with open(dict_path, "rb") as fdict: - for idx, line in enumerate(fdict): - if reverse: - word_dict[idx] = line.strip(b"\n") - else: - word_dict[line.strip(b"\n")] = idx - 
return word_dict - - def batch_generator(self, batch_size, use_token_batch): - def __impl__(): - # global sort or global shuffle - if self._sort_type == SortType.GLOBAL: - infos = sorted(self._sample_infos, key=lambda x: x.max_len) - else: - if self._shuffle: - infos = self._sample_infos - self._random.shuffle(infos) - else: - infos = self._sample_infos - - if self._sort_type == SortType.POOL: - reverse = True - for i in range(0, len(infos), self._pool_size): - # to avoid placing short next to long sentences - reverse = not reverse - infos[i:i + self._pool_size] = sorted( - infos[i:i + self._pool_size], - key=lambda x: x.max_len, - reverse=reverse) - - # concat batch - batches = [] - batch_creator = TokenBatchCreator( - batch_size) if use_token_batch else SentenceBatchCreator( - batch_size) - batch_creator = MinMaxFilter(self._max_length, self._min_length, - batch_creator) - - for info in infos: - batch = batch_creator.append(info) - if batch is not None: - batches.append(batch) - - if not self._clip_last_batch and len(batch_creator.batch) != 0: - batches.append(batch_creator.batch) - - if self._shuffle_batch: - self._random.shuffle(batches) - - for batch in batches: - batch_ids = [info.i for info in batch] - - if self._only_src: - yield [[self._src_seq_ids[idx]] for idx in batch_ids] - else: - yield [(self._src_seq_ids[idx], self._trg_seq_ids[idx][:-1], - self._trg_seq_ids[idx][1:]) for idx in batch_ids] - - return __impl__ - - @staticmethod - def stack(data_reader, count, clip_last=True): - def __impl__(): - res = [] - for item in data_reader(): - res.append(item) - if len(res) == count: - yield res - res = [] - if len(res) == count: - yield res - elif not clip_last: - data = [] - for item in res: - data += item - if len(data) > count: - inst_num_per_part = len(data) // count - yield [ - data[inst_num_per_part * i:inst_num_per_part * (i + 1)] - for i in range(count) - ] - - return __impl__ - - @staticmethod - def split(data_reader, count): - def __impl__(): - for item in data_reader(): - inst_num_per_part = len(item) // count - for i in range(count): - yield item[inst_num_per_part * i:inst_num_per_part * (i + 1 - )] - - return __impl__ - - def data_generator(self, phase, place=None): - # Any token included in dict can be used to pad, since the paddings' loss - # will be masked out by weights and make no effect on parameter gradients. 
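The comment above is worth unpacking: because every per-token loss is multiplied by a 0/1 weight before the reduction, the id chosen for padding is irrelevant. A toy numpy illustration (numbers made up) of the weighting that the model code applies further down:

```python
import numpy as np

# Per-token losses for two sequences padded to length 4; weights mark real
# tokens with 1 and padding with 0, so padded positions never reach the sum.
cost = np.array([[2.1, 1.3, 0.7, 0.9],
                 [1.8, 0.4, 0.5, 0.2]])
weights = np.array([[1., 1., 1., 0.],
                    [1., 1., 0., 0.]])
sum_cost = (cost * weights).sum()
avg_cost = sum_cost / weights.sum()  # normalize by the real-token count
print(sum_cost, avg_cost)
```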
- src_pad_idx = trg_pad_idx = self._eos_idx - bos_idx = self._bos_idx - n_head = self._n_head - data_reader = self.batch_generator( - self._batch_size * - (1 if self._use_token_batch else self._device_count), - self._use_token_batch) - if not self._use_token_batch: - # to make data on each device have similar token number - data_reader = self.split(data_reader, self._device_count) - - def __for_train__(): - for data in data_reader(): - data_inputs = prepare_train_input(data, src_pad_idx, - trg_pad_idx, n_head) - yield data_inputs - - def __for_predict__(): - for data in data_reader(): - data_inputs = prepare_infer_input(data, src_pad_idx, bos_idx, - n_head, place) - yield data_inputs - - return __for_train__ if phase == "train" else __for_predict__ - - def get_vocab_summary(self): - return len(self._src_vocab), len( - self._trg_vocab), self._bos_idx, self._eos_idx, self._unk_idx diff --git a/PaddleNLP/neural_machine_translation/transformer/train.py b/PaddleNLP/neural_machine_translation/transformer/train.py deleted file mode 100644 index 48b4847f68e849b109133d2d413b03e456e9825c..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/train.py +++ /dev/null @@ -1,327 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
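The `do_train` below scales `fluid.layers.learning_rate_scheduler.noam_decay(d_model, warmup_steps)` by a static `learning_rate`. Assuming the standard Noam formula from Vaswani et al. (2017), the resulting schedule can be sketched as:

```python
# Sketch of the schedule, assuming the standard Noam formula:
# lr(step) = base_lr * d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)
# Defaults mirror transformer.yaml (d_model=512, warmup_steps=8000, lr=2.0).
def noam_lr(step, d_model=512, warmup=8000, base_lr=2.0):
    return base_lr * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

for step in (1, 4000, 8000, 16000, 100000):
    print(step, noam_lr(step))  # rises linearly during warmup, then decays
```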
- -import logging -import os -import six -import sys -import time - -import numpy as np -import paddle -import paddle.fluid as fluid - -import utils.dist_utils as dist_utils -from utils.input_field import InputField -from utils.configure import PDConfig -from utils.check import check_gpu, check_version - -# include task-specific libs -import desc -import reader -from transformer import create_net, position_encoding_init - -if os.environ.get('FLAGS_eager_delete_tensor_gb', None) is None: - os.environ['FLAGS_eager_delete_tensor_gb'] = '0' -# num_trainers is used for multi-process gpu training -num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) - - -def init_from_pretrain_model(args, exe, program): - - assert isinstance(args.init_from_pretrain_model, str) - - if not os.path.exists(args.init_from_pretrain_model): - raise Warning("The pretrained params do not exist.") - return False - - def existed_params(var): - if not isinstance(var, fluid.framework.Parameter): - return False - return os.path.exists( - os.path.join(args.init_from_pretrain_model, var.name)) - - fluid.io.load_vars( - exe, - args.init_from_pretrain_model, - main_program=program, - predicate=existed_params) - - print("finish initing model from pretrained params from %s" % - (args.init_from_pretrain_model)) - - return True - - -def init_from_checkpoint(args, exe, program): - - assert isinstance(args.init_from_checkpoint, str) - - if not os.path.exists(args.init_from_checkpoint): - raise Warning("the checkpoint path does not exist.") - return False - - fluid.io.load_persistables( - executor=exe, - dirname=args.init_from_checkpoint, - main_program=program, - filename="checkpoint.pdckpt") - - print("finish initing model from checkpoint from %s" % - (args.init_from_checkpoint)) - - return True - - -def save_checkpoint(args, exe, program, dirname): - - assert isinstance(args.save_model_path, str) - - checkpoint_dir = os.path.join(args.save_model_path, args.save_checkpoint) - - if not os.path.exists(checkpoint_dir): - os.mkdir(checkpoint_dir) - - fluid.io.save_persistables( - exe, - os.path.join(checkpoint_dir, dirname), - main_program=program, - filename="checkpoint.pdparams") - - print("save checkpoint at %s" % (os.path.join(checkpoint_dir, dirname))) - - return True - - -def save_param(args, exe, program, dirname): - - assert isinstance(args.save_model_path, str) - - param_dir = os.path.join(args.save_model_path, args.save_param) - - if not os.path.exists(param_dir): - os.mkdir(param_dir) - - fluid.io.save_params( - exe, - os.path.join(param_dir, dirname), - main_program=program, - filename="params.pdparams") - print("save parameters at %s" % (os.path.join(param_dir, dirname))) - - return True - - -def do_train(args): - if args.use_cuda: - if num_trainers > 1: # for multi-process gpu training - dev_count = 1 - else: - dev_count = fluid.core.get_cuda_device_count() - gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) - place = fluid.CUDAPlace(gpu_id) - else: - dev_count = int(os.environ.get('CPU_NUM', 1)) - place = fluid.CPUPlace() - - # define the data generator - processor = reader.DataProcessor( - fpattern=args.training_file, - src_vocab_fpath=args.src_vocab_fpath, - trg_vocab_fpath=args.trg_vocab_fpath, - token_delimiter=args.token_delimiter, - use_token_batch=args.use_token_batch, - batch_size=args.batch_size, - device_count=dev_count, - pool_size=args.pool_size, - sort_type=args.sort_type, - shuffle=args.shuffle, - shuffle_batch=args.shuffle_batch, - start_mark=args.special_token[0], - 
end_mark=args.special_token[1], - unk_mark=args.special_token[2], - max_length=args.max_length, - n_head=args.n_head) - batch_generator = processor.data_generator(phase="train") - if num_trainers > 1: # for multi-process gpu training - batch_generator = fluid.contrib.reader.distributed_batch_reader( - batch_generator) - args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \ - args.unk_idx = processor.get_vocab_summary() - - train_prog = fluid.default_main_program() - startup_prog = fluid.default_startup_program() - random_seed = eval(str(args.random_seed)) - if random_seed is not None: - train_prog.random_seed = random_seed - startup_prog.random_seed = random_seed - - with fluid.program_guard(train_prog, startup_prog): - with fluid.unique_name.guard(): - - # define input and reader - - input_field_names = desc.encoder_data_input_fields + \ - desc.decoder_data_input_fields[:-1] + desc.label_data_input_fields - input_slots = [{ - "name": name, - "shape": desc.input_descs[name][0], - "dtype": desc.input_descs[name][1] - } for name in input_field_names] - - input_field = InputField(input_slots) - input_field.build(build_pyreader=True) - - # define the network - - sum_cost, avg_cost, token_num = create_net( - is_training=True, model_input=input_field, args=args) - - # define the optimizer - - with fluid.default_main_program()._lr_schedule_guard(): - learning_rate = fluid.layers.learning_rate_scheduler.noam_decay( - args.d_model, args.warmup_steps) * args.learning_rate - - optimizer = fluid.optimizer.Adam( - learning_rate=learning_rate, - beta1=args.beta1, - beta2=args.beta2, - epsilon=float(args.eps)) - optimizer.minimize(avg_cost) - - # prepare training - - ## decorate the pyreader with batch_generator - input_field.loader.set_batch_generator(batch_generator) - - ## define the executor and program for training - - exe = fluid.Executor(place) - - exe.run(startup_prog) - # init position_encoding - for pos_enc_param_name in desc.pos_enc_param_names: - pos_enc_param = fluid.global_scope().find_var( - pos_enc_param_name).get_tensor() - - pos_enc_param.set( - position_encoding_init(args.max_length + 1, args.d_model), place) - - assert (args.init_from_checkpoint == "") or ( - args.init_from_pretrain_model == "") - - ## init from some checkpoint, to resume the previous training - if args.init_from_checkpoint: - init_from_checkpoint(args, exe, train_prog) - - ## init from some pretrain models, to better solve the current task - if args.init_from_pretrain_model: - init_from_pretrain_model(args, exe, train_prog) - - build_strategy = fluid.compiler.BuildStrategy() - build_strategy.enable_inplace = True - exec_strategy = fluid.ExecutionStrategy() - if num_trainers > 1: - dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog) - exec_strategy.num_threads = 1 - - compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel( - loss_name=avg_cost.name, - build_strategy=build_strategy, - exec_strategy=exec_strategy) - - # the best cross-entropy value with label smoothing - loss_normalizer = -( - (1. - args.label_smooth_eps) * np.log( - (1. 
- args.label_smooth_eps)) + args.label_smooth_eps * - np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20)) - # start training - - step_idx = 0 - for pass_id in range(args.epoch): - pass_start_time = time.time() - input_field.loader.start() - - batch_id = 0 - while True: - try: - outs = exe.run(compiled_train_prog, - fetch_list=[sum_cost.name, token_num.name] - if step_idx % args.print_step == 0 else []) - - if step_idx % args.print_step == 0: - sum_cost_val, token_num_val = np.array(outs[0]), np.array( - outs[1]) - # sum the cost from multi-devices - total_sum_cost = sum_cost_val.sum() - total_token_num = token_num_val.sum() - total_avg_cost = total_sum_cost / total_token_num - - if step_idx == 0: - logging.info( - "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, " - "normalized loss: %f, ppl: %f" % - (step_idx, pass_id, batch_id, total_avg_cost, - total_avg_cost - loss_normalizer, - np.exp([min(total_avg_cost, 100)]))) - avg_batch_time = time.time() - else: - logging.info( - "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, " - "normalized loss: %f, ppl: %f, speed: %.2f step/s" % - (step_idx, pass_id, batch_id, total_avg_cost, - total_avg_cost - loss_normalizer, - np.exp([min(total_avg_cost, 100)]), - args.print_step / (time.time() - avg_batch_time))) - avg_batch_time = time.time() - - if step_idx % args.save_step == 0 and step_idx != 0: - - if args.save_checkpoint: - save_checkpoint(args, exe, train_prog, - "step_" + str(step_idx)) - - if args.save_param: - save_param(args, exe, train_prog, - "step_" + str(step_idx)) - - batch_id += 1 - step_idx += 1 - - except fluid.core.EOFException: - input_field.loader.reset() - break - - time_consumed = time.time() - pass_start_time - - if args.save_checkpoint: - save_checkpoint(args, exe, train_prog, "step_final") - - if args.save_param: - save_param(args, exe, train_prog, "step_final") - - if args.enable_ce: # For CE - print("kpis\ttrain_cost_card%d\t%f" % (dev_count, total_avg_cost)) - print("kpis\ttrain_duration_card%d\t%f" % (dev_count, time_consumed)) - - -if __name__ == "__main__": - args = PDConfig(yaml_file="./transformer.yaml") - args.build() - args.Print() - check_gpu(args.use_cuda) - check_version() - - do_train(args) diff --git a/PaddleNLP/neural_machine_translation/transformer/transformer.py b/PaddleNLP/neural_machine_translation/transformer/transformer.py deleted file mode 100644 index be20001b25fdb94fcc4bc234bae220413ddfacdd..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/transformer.py +++ /dev/null @@ -1,873 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
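`position_encoding_init` below fills the frozen position-embedding tables. An independent numpy sketch of the sinusoid table in the paper's formulation; note the deleted code spaces its timescales over `num_timescales - 1` rather than `d_model / 2`, a slight variation:

```python
import numpy as np

# Sinusoid position table, Vaswani et al. style: sin in the first d/2
# channels, cos in the rest, one row per position.
def sinusoid_table(n_position, d_model):
    pos = np.arange(n_position)[:, None]            # [n_position, 1]
    i = np.arange(d_model // 2)[None, :]            # [1, d_model // 2]
    angle = pos / np.power(1e4, 2.0 * i / d_model)  # broadcast to a grid
    return np.concatenate([np.sin(angle), np.cos(angle)],
                          axis=1).astype("float32")

print(sinusoid_table(6, 8).shape)  # (6, 8)
```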
-
-from functools import partial
-import numpy as np
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-
-from desc import *
-
-# Set seed for CE or debug
-dropout_seed = None
-
-
-def wrap_layer_with_block(layer, block_idx):
-    """
-    Make a layer definition support specifying the block it runs in, so that
-    layers can be added to blocks other than the current one. This makes it
-    easy to define caches shared across while-loop steps.
-    """
-
-    class BlockGuard(object):
-        """
-        BlockGuard class.
-
-        BlockGuard class is used to switch to the given block in a program by
-        using the Python `with` keyword.
-        """
-
-        def __init__(self, block_idx=None, main_program=None):
-            self.main_program = fluid.default_main_program(
-            ) if main_program is None else main_program
-            self.old_block_idx = self.main_program.current_block().idx
-            self.new_block_idx = block_idx
-
-        def __enter__(self):
-            self.main_program.current_block_idx = self.new_block_idx
-
-        def __exit__(self, exc_type, exc_val, exc_tb):
-            self.main_program.current_block_idx = self.old_block_idx
-            if exc_type is not None:
-                return False  # re-raise exception
-            return True
-
-    def layer_wrapper(*args, **kwargs):
-        with BlockGuard(block_idx):
-            return layer(*args, **kwargs)
-
-    return layer_wrapper
-
-
-def position_encoding_init(n_position, d_pos_vec):
-    """
-    Generate the initial values for the sinusoid position encoding table.
-    """
-    channels = d_pos_vec
-    position = np.arange(n_position)
-    num_timescales = channels // 2
-    log_timescale_increment = (np.log(float(1e4) / float(1)) /
-                               (num_timescales - 1))
-    inv_timescales = np.exp(np.arange(
-        num_timescales)) * -log_timescale_increment
-    scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
-                                                               0)
-    signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
-    signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
-    position_enc = signal
-    return position_enc.astype("float32")
-
-
-def multi_head_attention(queries,
-                         keys,
-                         values,
-                         attn_bias,
-                         d_key,
-                         d_value,
-                         d_model,
-                         n_head=1,
-                         dropout_rate=0.,
-                         cache=None,
-                         gather_idx=None,
-                         static_kv=False):
-    """
-    Multi-Head Attention. Note that attn_bias is added to the logit before
-    computing the softmax activation to mask certain selected positions so
-    that they will not be considered in attention weights.
-    """
-    keys = queries if keys is None else keys
-    values = keys if values is None else values
-
-    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
-        raise ValueError(
-            "Inputs: queries, keys and values should all be 3-D tensors.")
-
-    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
-        """
-        Add linear projection to queries, keys, and values.
-        """
-        q = layers.fc(input=queries,
-                      size=d_key * n_head,
-                      bias_attr=False,
-                      num_flatten_dims=2)
-        # For encoder-decoder attention in inference, insert the ops and vars
-        # into the global block to use as cache across beam search steps.
-        fc_layer = wrap_layer_with_block(
-            layers.fc, fluid.default_main_program().current_block(
-            ).parent_idx) if cache is not None and static_kv else layers.fc
-        k = fc_layer(
-            input=keys,
-            size=d_key * n_head,
-            bias_attr=False,
-            num_flatten_dims=2)
-        v = fc_layer(
-            input=values,
-            size=d_value * n_head,
-            bias_attr=False,
-            num_flatten_dims=2)
-        return q, k, v
-
-    def __split_heads_qkv(queries, keys, values, n_head, d_key, d_value):
-        """
-        Reshape input tensors at the last dimension to split multi-heads
-        and then transpose. Specifically, transform the input tensor with shape
-        [bs, max_sequence_length, n_head * hidden_dim] to the output tensor
-        with shape [bs, n_head, max_sequence_length, hidden_dim].
-        """
-        # The value 0 in shape attr means copying the corresponding dimension
-        # size of the input as the output dimension size.
-        reshaped_q = layers.reshape(
-            x=queries, shape=[0, 0, n_head, d_key], inplace=True)
-        # permute the dimensions into:
-        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
-        q = layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
-        # For encoder-decoder attention in inference, insert the ops and vars
-        # into the global block to use as cache across beam search steps.
-        reshape_layer = wrap_layer_with_block(
-            layers.reshape,
-            fluid.default_main_program().current_block(
-            ).parent_idx) if cache is not None and static_kv else layers.reshape
-        transpose_layer = wrap_layer_with_block(
-            layers.transpose,
-            fluid.default_main_program().current_block().
-            parent_idx) if cache is not None and static_kv else layers.transpose
-        reshaped_k = reshape_layer(
-            x=keys, shape=[0, 0, n_head, d_key], inplace=True)
-        k = transpose_layer(x=reshaped_k, perm=[0, 2, 1, 3])
-        reshaped_v = reshape_layer(
-            x=values, shape=[0, 0, n_head, d_value], inplace=True)
-        v = transpose_layer(x=reshaped_v, perm=[0, 2, 1, 3])
-
-        if cache is not None:  # only for faster inference
-            if static_kv:  # For encoder-decoder attention in inference
-                cache_k, cache_v = cache["static_k"], cache["static_v"]
-                # To init the static_k and static_v in cache.
-                # Maybe we can use condition_op(if_else) to do these at the
-                # first step in the while loop instead; however, it might be
-                # less efficient.
-                static_cache_init = wrap_layer_with_block(
-                    layers.assign,
-                    fluid.default_main_program().current_block().parent_idx)
-                static_cache_init(k, cache_k)
-                static_cache_init(v, cache_v)
-            else:  # For decoder self-attention in inference
-                cache_k, cache_v = cache["k"], cache["v"]
-            # gather cell states corresponding to the selected parent
-            select_k = layers.gather(cache_k, index=gather_idx)
-            select_v = layers.gather(cache_v, index=gather_idx)
-            if not static_kv:
-                # For self-attention in inference, use the cache and concat
-                # time steps.
-                select_k = layers.concat([select_k, k], axis=2)
-                select_v = layers.concat([select_v, v], axis=2)
-            # update cell states (caches) cached in the global block
-            layers.assign(select_k, cache_k)
-            layers.assign(select_v, cache_v)
-            return q, select_k, select_v
-        return q, k, v
-
-    def __combine_heads(x):
-        """
-        Transpose and then reshape the last two dimensions of input tensor x
-        so that it becomes one dimension, which is the reverse of
-        __split_heads.
-        """
-        if len(x.shape) != 4:
-            raise ValueError("Input(x) should be a 4-D Tensor.")
-
-        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
-        # The value 0 in shape attr means copying the corresponding dimension
-        # size of the input as the output dimension size.
- return layers.reshape( - x=trans_x, - shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], - inplace=True) - - def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): - """ - Scaled Dot-Product Attention - """ - product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_key**-0.5) - if attn_bias: - product += attn_bias - weights = layers.softmax(product) - if dropout_rate: - weights = layers.dropout( - weights, - dropout_prob=dropout_rate, - seed=dropout_seed, - is_test=False) - out = layers.matmul(weights, v) - return out - - q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - q, k, v = __split_heads_qkv(q, k, v, n_head, d_key, d_value) - - ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model, - dropout_rate) - - out = __combine_heads(ctx_multiheads) - - # Project back to the model size. - proj_out = layers.fc(input=out, - size=d_model, - bias_attr=False, - num_flatten_dims=2) - return proj_out - - -def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate): - """ - Position-wise Feed-Forward Networks. - This module consists of two linear transformations with a ReLU activation - in between, which is applied to each position separately and identically. - """ - hidden = layers.fc(input=x, - size=d_inner_hid, - num_flatten_dims=2, - act="relu") - if dropout_rate: - hidden = layers.dropout( - hidden, dropout_prob=dropout_rate, seed=dropout_seed, is_test=False) - out = layers.fc(input=hidden, size=d_hid, num_flatten_dims=2) - return out - - -def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.): - """ - Add residual connection, layer normalization and droput to the out tensor - optionally according to the value of process_cmd. - This will be used before or after multi-head attention and position-wise - feed-forward networks. - """ - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out = layers.layer_norm( - out, - begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.initializer.Constant(1.), - bias_attr=fluid.initializer.Constant(0.)) - elif cmd == "d": # add dropout - if dropout_rate: - out = layers.dropout( - out, - dropout_prob=dropout_rate, - seed=dropout_seed, - is_test=False) - return out - - -pre_process_layer = partial(pre_post_process_layer, None) -post_process_layer = pre_post_process_layer - - -def prepare_encoder_decoder(src_word, - src_pos, - src_vocab_size, - src_emb_dim, - src_max_len, - dropout_rate=0., - bos_idx=0, - word_emb_param_name=None, - pos_enc_param_name=None): - """Add word embeddings and position encodings. - The output tensor has a shape of: - [batch_size, max_src_length_in_batch, d_model]. - This module is used at the bottom of the encoder stacks. 
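For reference, the core of the `scaled_dot_product_attention` defined above reduces to a few numpy lines. A self-contained sketch with made-up shapes; masked slots would carry a large negative bias (this codebase uses -1e9) so their softmax weight vanishes:

```python
import numpy as np

# softmax(q @ k^T * d_key**-0.5 + bias) @ v, shapes [batch, n_head, len, d].
def attention(q, k, v, bias, d_key):
    logits = q @ k.transpose(0, 1, 3, 2) * d_key ** -0.5 + bias
    weights = np.exp(logits - logits.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)  # numerically stable softmax
    return weights @ v

q = k = v = np.random.rand(2, 8, 5, 64)  # batch=2, heads=8, len=5, d_key=64
bias = np.zeros((2, 8, 5, 5))            # put -1e9 at positions to mask
print(attention(q, k, v, bias, d_key=64).shape)  # (2, 8, 5, 64)
```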
- """ - src_word_emb = fluid.embedding( - src_word, - size=[src_vocab_size, src_emb_dim], - padding_idx=bos_idx, # set embedding of bos to 0 - param_attr=fluid.ParamAttr(name=word_emb_param_name, - initializer=fluid.initializer.Normal( - 0., src_emb_dim**-0.5))) - - src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5) - src_pos_enc = fluid.embedding(src_pos, - size=[src_max_len, src_emb_dim], - param_attr=fluid.ParamAttr( - name=pos_enc_param_name, trainable=False)) - src_pos_enc.stop_gradient = True - enc_input = src_word_emb + src_pos_enc - return layers.dropout( - enc_input, dropout_prob=dropout_rate, seed=dropout_seed, - is_test=False) if dropout_rate else enc_input - - -prepare_encoder = partial( - prepare_encoder_decoder, pos_enc_param_name=pos_enc_param_names[0]) -prepare_decoder = partial( - prepare_encoder_decoder, pos_enc_param_name=pos_enc_param_names[1]) - - -def encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd="n", - postprocess_cmd="da"): - """The encoder layers that can be stacked to form a deep encoder. - This module consits of a multi-head (self) attention followed by - position-wise feed-forward networks and both the two components companied - with the post_process_layer to add residual connection, layer normalization - and droput. - """ - attn_output = multi_head_attention( - pre_process_layer(enc_input, preprocess_cmd, - prepostprocess_dropout), None, None, attn_bias, d_key, - d_value, d_model, n_head, attention_dropout) - attn_output = post_process_layer(enc_input, attn_output, postprocess_cmd, - prepostprocess_dropout) - ffd_output = positionwise_feed_forward( - pre_process_layer(attn_output, preprocess_cmd, prepostprocess_dropout), - d_inner_hid, d_model, relu_dropout) - return post_process_layer(attn_output, ffd_output, postprocess_cmd, - prepostprocess_dropout) - - -def encoder(enc_input, - attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd="n", - postprocess_cmd="da"): - """ - The encoder is composed of a stack of identical layers returned by calling - encoder_layer. - """ - for i in range(n_layer): - enc_output = encoder_layer( - enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, ) - enc_input = enc_output - enc_output = pre_process_layer(enc_output, preprocess_cmd, - prepostprocess_dropout) - return enc_output - - -def decoder_layer(dec_input, - enc_output, - slf_attn_bias, - dec_enc_attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - cache=None, - gather_idx=None): - """ The layer to be stacked in decoder part. - The structure of this module is similar to that in the encoder part except - a multi-head attention is added to implement encoder-decoder attention. 
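The `preprocess_cmd`/`postprocess_cmd` strings threaded through these layers are tiny programs over `pre_post_process_layer`: "n" normalizes, "d" applies dropout, "a" adds the residual. A string-based sketch of how "n" and "da" compose (the names are symbolic stand-ins, not the real ops):

```python
# Symbolic stand-in for pre_post_process_layer: each command letter rewrites
# the expression, so "da" means dropout first, then add the residual.
def process(prev_out, out, cmd):
    for c in cmd:
        if c == "a":
            out = "({} + {})".format(out, prev_out) if prev_out else out
        elif c == "n":
            out = "norm({})".format(out)
        elif c == "d":
            out = "drop({})".format(out)
    return out

print(process(None, "x", "n"))      # pre-process:  norm(x)
print(process("x", "attn", "da"))   # post-process: (drop(attn) + x)
```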
- """ - slf_attn_output = multi_head_attention( - pre_process_layer(dec_input, preprocess_cmd, prepostprocess_dropout), - None, - None, - slf_attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - cache=cache, - gather_idx=gather_idx) - slf_attn_output = post_process_layer( - dec_input, - slf_attn_output, - postprocess_cmd, - prepostprocess_dropout, ) - enc_attn_output = multi_head_attention( - pre_process_layer(slf_attn_output, preprocess_cmd, - prepostprocess_dropout), - enc_output, - enc_output, - dec_enc_attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - cache=cache, - gather_idx=gather_idx, - static_kv=True) - enc_attn_output = post_process_layer( - slf_attn_output, - enc_attn_output, - postprocess_cmd, - prepostprocess_dropout, ) - ffd_output = positionwise_feed_forward( - pre_process_layer(enc_attn_output, preprocess_cmd, - prepostprocess_dropout), - d_inner_hid, - d_model, - relu_dropout, ) - dec_output = post_process_layer( - enc_attn_output, - ffd_output, - postprocess_cmd, - prepostprocess_dropout, ) - return dec_output - - -def decoder(dec_input, - enc_output, - dec_slf_attn_bias, - dec_enc_attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - caches=None, - gather_idx=None): - """ - The decoder is composed of a stack of identical decoder_layer layers. - """ - for i in range(n_layer): - dec_output = decoder_layer( - dec_input, - enc_output, - dec_slf_attn_bias, - dec_enc_attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - cache=None if caches is None else caches[i], - gather_idx=gather_idx) - dec_input = dec_output - dec_output = pre_process_layer(dec_output, preprocess_cmd, - prepostprocess_dropout) - return dec_output - - -def transformer(model_input, - src_vocab_size, - trg_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - label_smooth_eps, - bos_idx=0, - is_test=False): - if weight_sharing: - assert src_vocab_size == trg_vocab_size, ( - "Vocabularies in source and target should be same for weight sharing." - ) - - enc_inputs = (model_input.src_word, model_input.src_pos, - model_input.src_slf_attn_bias) - dec_inputs = (model_input.trg_word, model_input.trg_pos, - model_input.trg_slf_attn_bias, model_input.trg_src_attn_bias) - label = model_input.lbl_word - weights = model_input.lbl_weight - - enc_output = wrap_encoder(enc_inputs, - src_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - bos_idx=bos_idx) - - predict = wrap_decoder(dec_inputs, - trg_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - enc_output=enc_output) - - # Padding index do not contribute to the total loss. The weights is used to - # cancel padding index in calculating the loss. - if label_smooth_eps: - # TODO: use fluid.input.one_hot after softmax_with_cross_entropy removing - # the enforcement that the last dimension of label must be 1. 
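Before the `label_smooth` call that follows, it may help to see the target distribution it produces. A numpy sketch of the smoothing that the training code's loss normalizer assumes: the true token keeps `1 - eps` and the remaining mass is spread uniformly over the other classes (tiny vocabulary and targets made up for display):

```python
import numpy as np

eps, V = 0.1, 5                        # label_smooth_eps and a toy vocab size
one_hot = np.eye(V)[np.array([2, 0])]  # two target token ids
smoothed = one_hot * (1 - eps) + (1 - one_hot) * eps / (V - 1)
print(smoothed.round(3))               # each row still sums to 1
```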
- label = layers.label_smooth(label=layers.one_hot(input=label, - depth=trg_vocab_size), - epsilon=label_smooth_eps) - - cost = layers.softmax_with_cross_entropy( - logits=predict, - label=label, - soft_label=True if label_smooth_eps else False) - weighted_cost = layers.elementwise_mul(x=cost, y=weights, axis=0) - sum_cost = layers.reduce_sum(weighted_cost) - token_num = layers.reduce_sum(weights) - token_num.stop_gradient = True - avg_cost = sum_cost / token_num - return sum_cost, avg_cost, predict, token_num - - -def wrap_encoder(enc_inputs, - src_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - bos_idx=0): - """ - The wrapper assembles together all needed layers for the encoder. - """ - src_word, src_pos, src_slf_attn_bias = enc_inputs - enc_input = prepare_encoder( - src_word, - src_pos, - src_vocab_size, - d_model, - max_length, - prepostprocess_dropout, - bos_idx=bos_idx, - word_emb_param_name=word_emb_param_names[0]) - enc_output = encoder( - enc_input, - src_slf_attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, ) - return enc_output - - -def wrap_decoder(dec_inputs, - trg_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - enc_output=None, - caches=None, - gather_idx=None, - bos_idx=0): - """ - The wrapper assembles together all needed layers for the decoder. - """ - trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs - - dec_input = prepare_decoder( - trg_word, - trg_pos, - trg_vocab_size, - d_model, - max_length, - prepostprocess_dropout, - bos_idx=bos_idx, - word_emb_param_name=word_emb_param_names[0] - if weight_sharing else word_emb_param_names[1]) - dec_output = decoder( - dec_input, - enc_output, - trg_slf_attn_bias, - trg_src_attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - caches=caches, - gather_idx=gather_idx) - # Reshape to 2D tensor to use GEMM instead of BatchedGEMM - dec_output = layers.reshape( - dec_output, shape=[-1, dec_output.shape[-1]], inplace=True) - if weight_sharing: - predict = layers.matmul( - x=dec_output, - y=fluid.default_main_program().global_block().var( - word_emb_param_names[0]), - transpose_y=True) - else: - predict = layers.fc(input=dec_output, - size=trg_vocab_size, - bias_attr=False) - if dec_inputs is None: - # Return probs for independent decoder program. - predict = layers.softmax(predict) - return predict - - -def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len, - n_layer, n_head, d_key, d_value, d_model, d_inner_hid, - prepostprocess_dropout, attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd, weight_sharing, beam_size, - max_out_len, bos_idx, eos_idx): - """ - Use beam search to decode. Caches will be used to store states of history - steps which can make the decoding faster. 
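A toy numpy sketch of the per-layer decoder self-attention cache this docstring refers to: each step's new key/value slice is concatenated along the time axis, so step `t` attends over all earlier positions without recomputation (shapes are made up, mirroring the `[-1, n_head, 0, d_key]` layout used below):

```python
import numpy as np

batch, n_head, d_key = 3, 2, 4
cache_k = np.zeros((batch, n_head, 0, d_key), dtype="float32")  # empty time axis
for step in range(3):
    new_k = np.random.rand(batch, n_head, 1, d_key).astype("float32")
    cache_k = np.concatenate([cache_k, new_k], axis=2)  # one slot per step
print(cache_k.shape)  # (3, 2, 3, 4) after three decoded steps
```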
- """ - enc_inputs = (model_input.src_word, model_input.src_pos, - model_input.src_slf_attn_bias) - dec_inputs = (model_input.trg_word, model_input.init_score, - model_input.init_idx, model_input.trg_src_attn_bias) - - enc_output = wrap_encoder(enc_inputs, - src_vocab_size, - max_in_len, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - bos_idx=bos_idx) - start_tokens, init_scores, parent_idx, trg_src_attn_bias = dec_inputs - - def beam_search(): - max_len = layers.fill_constant( - shape=[1], - dtype=start_tokens.dtype, - value=max_out_len, - force_cpu=True) - step_idx = layers.fill_constant( - shape=[1], dtype=start_tokens.dtype, value=0, force_cpu=True) - cond = layers.less_than(x=step_idx, y=max_len) # default force_cpu=True - while_op = layers.While(cond) - # array states will be stored for each step. - ids = layers.array_write( - layers.reshape(start_tokens, (-1, 1)), step_idx) - scores = layers.array_write(init_scores, step_idx) - # cell states will be overwrited at each step. - # caches contains states of history steps in decoder self-attention - # and static encoder output projections in encoder-decoder attention - # to reduce redundant computation. - caches = [ - { - "k": # for self attention - layers.fill_constant_batch_size_like( - input=start_tokens, - shape=[-1, n_head, 0, d_key], - dtype=enc_output.dtype, - value=0), - "v": # for self attention - layers.fill_constant_batch_size_like( - input=start_tokens, - shape=[-1, n_head, 0, d_value], - dtype=enc_output.dtype, - value=0), - "static_k": # for encoder-decoder attention - layers.create_tensor(dtype=enc_output.dtype), - "static_v": # for encoder-decoder attention - layers.create_tensor(dtype=enc_output.dtype) - } for i in range(n_layer) - ] - - with while_op.block(): - pre_ids = layers.array_read(array=ids, i=step_idx) - # Since beam_search_op dosen't enforce pre_ids' shape, we can do - # inplace reshape here which actually change the shape of pre_ids. - # pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True) - pre_scores = layers.array_read(array=scores, i=step_idx) - # gather cell states corresponding to selected parent - pre_src_attn_bias = layers.gather( - trg_src_attn_bias, index=parent_idx) - pre_pos = layers.elementwise_mul( - x=layers.fill_constant_batch_size_like( - input=pre_src_attn_bias, # cann't use lod tensor here - value=1, - shape=[-1, 1], - dtype=pre_ids.dtype), - y=step_idx, - axis=0) - logits = wrap_decoder((pre_ids, pre_pos, None, pre_src_attn_bias), - trg_vocab_size, - max_in_len, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - enc_output=enc_output, - caches=caches, - gather_idx=parent_idx, - bos_idx=bos_idx) - # intra-beam topK - topk_scores, topk_indices = layers.topk( - input=layers.softmax(logits), k=beam_size) - accu_scores = layers.elementwise_add( - x=layers.log(topk_scores), y=pre_scores, axis=0) - # beam_search op uses lod to differentiate branches. 
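The two statements above implement the per-step scoring: log-probabilities of the intra-beam top-k candidates are added to each branch's accumulated score before the cross-beam reduction. A numpy analogue with made-up numbers:

```python
import numpy as np

pre_scores = np.array([-0.1, -0.9])    # accumulated log-prob, one per branch
topk_scores = np.array([[0.6, 0.3],    # intra-beam softmax top-k, beam_size=2
                        [0.5, 0.4]])
# elementwise_add(x=log(topk_scores), y=pre_scores, axis=0) broadcasts each
# branch's running score over its candidates:
accu = np.log(topk_scores) + pre_scores[:, None]
print(accu.round(3))
```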
- accu_scores = layers.lod_reset(accu_scores, pre_ids) - # topK reduction across beams, also contain special handle of - # end beams and end sentences(batch reduction) - selected_ids, selected_scores, gather_idx = layers.beam_search( - pre_ids=pre_ids, - pre_scores=pre_scores, - ids=topk_indices, - scores=accu_scores, - beam_size=beam_size, - end_id=eos_idx, - return_parent_idx=True) - layers.increment(x=step_idx, value=1.0, in_place=True) - # cell states(caches) have been updated in wrap_decoder, - # only need to update beam search states here. - layers.array_write(selected_ids, i=step_idx, array=ids) - layers.array_write(selected_scores, i=step_idx, array=scores) - layers.assign(gather_idx, parent_idx) - layers.assign(pre_src_attn_bias, trg_src_attn_bias) - length_cond = layers.less_than(x=step_idx, y=max_len) - finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) - layers.logical_and(x=length_cond, y=finish_cond, out=cond) - - finished_ids, finished_scores = layers.beam_search_decode( - ids, scores, beam_size=beam_size, end_id=eos_idx) - return finished_ids, finished_scores - - finished_ids, finished_scores = beam_search() - return finished_ids, finished_scores - - -def create_net(is_training, model_input, args): - if is_training: - sum_cost, avg_cost, _, token_num = transformer( - model_input, args.src_vocab_size, args.trg_vocab_size, - args.max_length + 1, args.n_layer, args.n_head, args.d_key, - args.d_value, args.d_model, args.d_inner_hid, - args.prepostprocess_dropout, args.attention_dropout, - args.relu_dropout, args.preprocess_cmd, args.postprocess_cmd, - args.weight_sharing, args.label_smooth_eps, args.bos_idx) - return sum_cost, avg_cost, token_num - else: - out_ids, out_scores = fast_decode( - model_input, args.src_vocab_size, args.trg_vocab_size, - args.max_length + 1, args.n_layer, args.n_head, args.d_key, - args.d_value, args.d_model, args.d_inner_hid, - args.prepostprocess_dropout, args.attention_dropout, - args.relu_dropout, args.preprocess_cmd, args.postprocess_cmd, - args.weight_sharing, args.beam_size, args.max_out_len, args.bos_idx, - args.eos_idx) - return out_ids, out_scores diff --git a/PaddleNLP/neural_machine_translation/transformer/transformer.yaml b/PaddleNLP/neural_machine_translation/transformer/transformer.yaml deleted file mode 100644 index c6cbc074ed8a76c8b4d649e7631f0c125e165511..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/transformer.yaml +++ /dev/null @@ -1,111 +0,0 @@ -# used for continuous evaluation -enable_ce: False - -# The frequency to save trained models when training. -save_step: 10000 -# The frequency to fetch and print output when training. -print_step: 100 -# path of the checkpoint, to resume the previous training -init_from_checkpoint: "" -# path of the pretrain model, to better solve the current task -init_from_pretrain_model: "" -# path of trained parameter, to make prediction -init_from_params: "trained_params/step_100000" -save_model_path: "" -# the directory for saving checkpoints. -save_checkpoint: "trained_ckpts" -# the directory for saving trained parameters. -save_param: "trained_params" -# the directory for saving inference model. -inference_model_dir: "infer_model" -# Set seed for CE or debug -random_seed: None -# The pattern to match training data files. -training_file: "wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de" -# The pattern to match test data files. 
-predict_file: "wmt16_ende_data_bpe/newstest2016.tok.bpe.32000.en-de"
-# The file to output the translation results of predict_file to.
-output_file: "predict.txt"
-# The path of vocabulary file of source language.
-src_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000"
-# The path of vocabulary file of target language.
-trg_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000"
-# The <s>, <e> and <unk> tokens in the dictionary.
-special_token: ["<s>", "<e>", "<unk>"]
-
-# whether to use cuda
-use_cuda: True
-
-# args for reader, see reader.py for details
-token_delimiter: " "
-use_token_batch: True
-pool_size: 200000
-sort_type: "pool"
-shuffle: True
-shuffle_batch: True
-batch_size: 4096
-
-# Hyperparams for training:
-# the number of epochs for training
-epoch: 30
-# the hyperparameters for the Adam optimizer.
-# This static learning_rate will be multiplied by the LearningRateScheduler-
-# derived learning rate to get the final learning rate.
-learning_rate: 2.0
-beta1: 0.9
-beta2: 0.997
-eps: 1e-9
-# the parameters for learning rate scheduling.
-warmup_steps: 8000
-# the weight used to mix up the ground-truth distribution and the fixed
-# uniform distribution in label smoothing when training.
-# Set this as zero if label smoothing is not wanted.
-label_smooth_eps: 0.1
-
-# Hyperparams for generation:
-# the parameters for beam search.
-beam_size: 5
-max_out_len: 256
-# the number of decoded sentences to output.
-n_best: 1
-
-# Hyperparams for model:
-# The following five vocabulary-related configurations will be set
-# automatically according to the passed vocabulary path and special tokens.
-# size of source word dictionary.
-src_vocab_size: 10000
-# size of target word dictionary.
-trg_vocab_size: 10000
-# index for <s> token
-bos_idx: 0
-# index for <e> token
-eos_idx: 1
-# index for <unk> token
-unk_idx: 2
-# the max length of sequences, which decides the size of the position encoding table.
-max_length: 256
-# the dimension for word embeddings, which is also the last dimension of
-# the input and output of multi-head attention, position-wise feed-forward
-# networks, encoder and decoder.
-d_model: 512
-# size of the hidden layer in position-wise feed-forward networks.
-d_inner_hid: 2048
-# the dimension that keys are projected to for dot-product attention.
-d_key: 64
-# the dimension that values are projected to for dot-product attention.
-d_value: 64
-# number of heads used in multi-head attention.
-n_head: 8
-# number of sub-layers to be stacked in the encoder and decoder.
-n_layer: 6
-# dropout rates of different modules.
-prepostprocess_dropout: 0.1
-attention_dropout: 0.1
-relu_dropout: 0.1
-# to process before each sub-layer
-preprocess_cmd: "n" # layer normalization
-# to process after each sub-layer
-postprocess_cmd: "da" # dropout + residual connection
-# the flag indicating whether to share embedding and softmax weights.
-# vocabularies in source and target should be the same for weight sharing.
-weight_sharing: True diff --git a/PaddleNLP/neural_machine_translation/transformer/utils/__init__.py b/PaddleNLP/neural_machine_translation/transformer/utils/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/PaddleNLP/neural_machine_translation/transformer/utils/check.py b/PaddleNLP/neural_machine_translation/transformer/utils/check.py deleted file mode 100644 index 305fa3705f5c313569986cbdb15c8afeda5a79c1..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/utils/check.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import sys - -import paddle.fluid as fluid - -import logging -logger = logging.getLogger(__name__) - -__all__ = ['check_gpu', 'check_version'] - - -def check_gpu(use_gpu): - """ - Log error and exit when set use_gpu=true in paddlepaddle - cpu version. - """ - err = "Config use_gpu cannot be set as true while you are " \ - "using paddlepaddle cpu version ! \nPlease try: \n" \ - "\t1. Install paddlepaddle-gpu to run model on GPU \n" \ - "\t2. Set use_gpu as false in config file to run " \ - "model on CPU" - - try: - if use_gpu and not fluid.is_compiled_with_cuda(): - logger.error(err) - sys.exit(1) - except Exception as e: - pass - - -def check_version(): - """ - Log error and exit when the installed version of paddlepaddle is - not satisfied. - """ - err = "PaddlePaddle version 1.6 or higher is required, " \ - "or a suitable develop version is satisfied as well. \n" \ - "Please make sure the version is good with your code." \ - - try: - fluid.require_version('1.6.0') - except Exception as e: - logger.error(err) - sys.exit(1) diff --git a/PaddleNLP/neural_machine_translation/transformer/utils/configure.py b/PaddleNLP/neural_machine_translation/transformer/utils/configure.py deleted file mode 100644 index 2ea9fd96817f461889d24cbbd0c5d9ae76585a0a..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/utils/configure.py +++ /dev/null @@ -1,345 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
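A hypothetical usage sketch of the `PDConfig` helper defined below, run from this directory with the `transformer.yaml` shown above (the extra `my_tag` flag is purely illustrative):

```python
from utils.configure import PDConfig

args = PDConfig(yaml_file="./transformer.yaml")
args += ("my_tag", str, "exp1", "A custom run tag.")  # __add__ registers an arg
args.build()   # parse the CLI; command-line values override the yaml
args.Print()
print(args.batch_size, args.my_tag)
```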
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import argparse -import json -import yaml -import six -import logging - -logging_only_message = "%(message)s" -logging_details = "%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s" - - -class JsonConfig(object): - """ - A high-level api for handling json configure file. - """ - - def __init__(self, config_path): - self._config_dict = self._parse(config_path) - - def _parse(self, config_path): - try: - with open(config_path) as json_file: - config_dict = json.load(json_file) - except: - raise IOError("Error in parsing bert model config file '%s'" % - config_path) - else: - return config_dict - - def __getitem__(self, key): - return self._config_dict[key] - - def print_config(self): - for arg, value in sorted(six.iteritems(self._config_dict)): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -class ArgumentGroup(object): - def __init__(self, parser, title, des): - self._group = parser.add_argument_group(title=title, description=des) - - def add_arg(self, name, type, default, help, **kwargs): - type = str2bool if type == bool else type - self._group.add_argument( - "--" + name, - default=default, - type=type, - help=help + ' Default: %(default)s.', - **kwargs) - - -class ArgConfig(object): - """ - A high-level api for handling argument configs. - """ - - def __init__(self): - parser = argparse.ArgumentParser() - - train_g = ArgumentGroup(parser, "training", "training options.") - train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.") - train_g.add_arg("learning_rate", float, 5e-5, - "Learning rate used to train with warmup.") - train_g.add_arg( - "lr_scheduler", - str, - "linear_warmup_decay", - "scheduler of learning rate.", - choices=['linear_warmup_decay', 'noam_decay']) - train_g.add_arg("weight_decay", float, 0.01, - "Weight decay rate for L2 regularizer.") - train_g.add_arg( - "warmup_proportion", float, 0.1, - "Proportion of training steps to perform linear learning rate warmup for." - ) - train_g.add_arg("save_steps", int, 1000, - "The steps interval to save checkpoints.") - train_g.add_arg("use_fp16", bool, False, - "Whether to use fp16 mixed precision training.") - train_g.add_arg( - "loss_scaling", float, 1.0, - "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled." 
- ) - train_g.add_arg("pred_dir", str, None, - "Path to save the prediction results") - - log_g = ArgumentGroup(parser, "logging", "logging related.") - log_g.add_arg("skip_steps", int, 10, - "The steps interval to print loss.") - log_g.add_arg("verbose", bool, False, "Whether to output verbose log.") - - run_type_g = ArgumentGroup(parser, "run_type", "running type options.") - run_type_g.add_arg("use_cuda", bool, True, - "If set, use GPU for training.") - run_type_g.add_arg( - "use_fast_executor", bool, False, - "If set, use fast parallel executor (in experiment).") - run_type_g.add_arg( - "num_iteration_per_drop_scope", int, 1, - "Ihe iteration intervals to clean up temporary variables.") - run_type_g.add_arg("do_train", bool, True, - "Whether to perform training.") - run_type_g.add_arg("do_predict", bool, True, - "Whether to perform prediction.") - - custom_g = ArgumentGroup(parser, "customize", "customized options.") - - self.custom_g = custom_g - - self.parser = parser - - def add_arg(self, name, dtype, default, descrip): - self.custom_g.add_arg(name, dtype, default, descrip) - - def build_conf(self): - return self.parser.parse_args() - - -def str2bool(v): - # because argparse does not support to parse "true, False" as python - # boolean directly - return v.lower() in ("true", "t", "1") - - -def print_arguments(args, log=None): - if not log: - print('----------- Configuration Arguments -----------') - for arg, value in sorted(six.iteritems(vars(args))): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - else: - log.info('----------- Configuration Arguments -----------') - for arg, value in sorted(six.iteritems(vars(args))): - log.info('%s: %s' % (arg, value)) - log.info('------------------------------------------------') - - -class PDConfig(object): - """ - A high-level API for managing configuration files in PaddlePaddle. - Can jointly work with command-line-arugment, json files and yaml files. - """ - - def __init__(self, json_file="", yaml_file="", fuse_args=True): - """ - Init funciton for PDConfig. - json_file: the path to the json configure file. - yaml_file: the path to the yaml configure file. - fuse_args: if fuse the json/yaml configs with argparse. - """ - assert isinstance(json_file, str) - assert isinstance(yaml_file, str) - - if json_file != "" and yaml_file != "": - raise Warning( - "json_file and yaml_file can not co-exist for now. please only use one configure file type." 
- ) - return - - self.args = None - self.arg_config = {} - self.json_config = {} - self.yaml_config = {} - - parser = argparse.ArgumentParser() - - self.default_g = ArgumentGroup(parser, "default", "default options.") - self.yaml_g = ArgumentGroup(parser, "yaml", "options from yaml.") - self.json_g = ArgumentGroup(parser, "json", "options from json.") - self.com_g = ArgumentGroup(parser, "custom", "customized options.") - - self.default_g.add_arg("do_train", bool, False, - "Whether to perform training.") - self.default_g.add_arg("do_predict", bool, False, - "Whether to perform predicting.") - self.default_g.add_arg("do_eval", bool, False, - "Whether to perform evaluating.") - self.default_g.add_arg("do_save_inference_model", bool, False, - "Whether to perform model saving for inference.") - - self.parser = parser - - if json_file != "": - self.load_json(json_file, fuse_args=fuse_args) - - if yaml_file: - self.load_yaml(yaml_file, fuse_args=fuse_args) - - def load_json(self, file_path, fuse_args=True): - - if not os.path.exists(file_path): - raise Warning("the json file %s does not exist." % file_path) - return - - with open(file_path, "r") as fin: - self.json_config = json.loads(fin.read()) - fin.close() - - if fuse_args: - for name in self.json_config: - if isinstance(self.json_config[name], list): - self.json_g.add_arg( - name, - type(self.json_config[name][0]), - self.json_config[name], - "This is from %s" % file_path, - nargs=len(self.json_config[name])) - continue - if not isinstance(self.json_config[name], int) \ - and not isinstance(self.json_config[name], float) \ - and not isinstance(self.json_config[name], str) \ - and not isinstance(self.json_config[name], bool): - - continue - - self.json_g.add_arg(name, - type(self.json_config[name]), - self.json_config[name], - "This is from %s" % file_path) - - def load_yaml(self, file_path, fuse_args=True): - - if not os.path.exists(file_path): - raise Warning("the yaml file %s does not exist." % file_path) - return - - with open(file_path, "r") as fin: - self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader) - fin.close() - - if fuse_args: - for name in self.yaml_config: - if isinstance(self.yaml_config[name], list): - self.yaml_g.add_arg( - name, - type(self.yaml_config[name][0]), - self.yaml_config[name], - "This is from %s" % file_path, - nargs=len(self.yaml_config[name])) - continue - - if not isinstance(self.yaml_config[name], int) \ - and not isinstance(self.yaml_config[name], float) \ - and not isinstance(self.yaml_config[name], str) \ - and not isinstance(self.yaml_config[name], bool): - - continue - - self.yaml_g.add_arg(name, - type(self.yaml_config[name]), - self.yaml_config[name], - "This is from %s" % file_path) - - def build(self): - self.args = self.parser.parse_args() - self.arg_config = vars(self.args) - - def __add__(self, new_arg): - assert isinstance(new_arg, list) or isinstance(new_arg, tuple) - assert len(new_arg) >= 3 - assert self.args is None - - name = new_arg[0] - dtype = new_arg[1] - dvalue = new_arg[2] - desc = new_arg[3] if len( - new_arg) == 4 else "Description is not provided." - - self.com_g.add_arg(name, dtype, dvalue, desc) - - return self - - def __getattr__(self, name): - if name in self.arg_config: - return self.arg_config[name] - - if name in self.json_config: - return self.json_config[name] - - if name in self.yaml_config: - return self.yaml_config[name] - - raise Warning("The argument %s is not defined." 
-
-    def Print(self):
-
-        print("-" * 70)
-        for name in self.arg_config:
-            print("%s:\t\t\t\t%s" % (str(name), str(self.arg_config[name])))
-
-        for name in self.json_config:
-            if name not in self.arg_config:
-                print("%s:\t\t\t\t%s" %
-                      (str(name), str(self.json_config[name])))
-
-        for name in self.yaml_config:
-            if name not in self.arg_config:
-                print("%s:\t\t\t\t%s" %
-                      (str(name), str(self.yaml_config[name])))
-
-        print("-" * 70)
-
-
-if __name__ == "__main__":
-    """
-    pd_config = PDConfig(json_file = "./test/bert_config.json")
-    pd_config.build()
-
-    print(pd_config.do_train)
-    print(pd_config.hidden_size)
-
-    pd_config = PDConfig(yaml_file = "./test/bert_config.yaml")
-    pd_config.build()
-
-    print(pd_config.do_train)
-    print(pd_config.hidden_size)
-    """
-
-    pd_config = PDConfig(yaml_file="./test/bert_config.yaml")
-    pd_config += ("my_age", int, 18, "I am forever 18.")
-    pd_config.build()
-
-    print(pd_config.do_train)
-    print(pd_config.hidden_size)
-    print(pd_config.my_age)
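For orientation, here is a minimal usage sketch of the `PDConfig` class deleted above, assuming the module now lives at `utils/configure.py`; the `./transformer.yaml` path and the `d_model` key are hypothetical stand-ins, not values taken from this patch:

```python
# Minimal PDConfig sketch; "./transformer.yaml" and "d_model" are assumptions.
from utils.configure import PDConfig

args = PDConfig(yaml_file="./transformer.yaml")
# Register an extra command-line flag before build(); tuples are
# (name, type, default, description).
args += ("use_amp", bool, False, "Whether to use mixed precision.")
args.build()  # parse argv and fuse it with the yaml entries
args.Print()  # dump the resolved configuration

if args.do_train:  # do_train is one of PDConfig's built-in flags
    print(args.d_model)  # falls back to the raw yaml config if not fused
```

Because `__add__` asserts `self.args is None`, all custom options must be registered before `build()` is called.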
diff --git a/PaddleNLP/neural_machine_translation/transformer/utils/dist_utils.py b/PaddleNLP/neural_machine_translation/transformer/utils/dist_utils.py
deleted file mode 100644
index 503431029f0242d27473ae5d4d95834f99ef0f84..0000000000000000000000000000000000000000
--- a/PaddleNLP/neural_machine_translation/transformer/utils/dist_utils.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import os
-import paddle.fluid as fluid
-
-
-def nccl2_prepare(trainer_id, startup_prog, main_prog):
-    config = fluid.DistributeTranspilerConfig()
-    config.mode = "nccl2"
-    t = fluid.DistributeTranspiler(config=config)
-    t.transpile(
-        trainer_id,
-        trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'),
-        current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'),
-        startup_program=startup_prog,
-        program=main_prog)
-
-
-def prepare_for_multi_process(exe, build_strategy, train_prog):
-    # prepare for multi-process
-    trainer_id = int(os.environ.get('PADDLE_TRAINER_ID', 0))
-    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
-    if num_trainers < 2: return
-    print("PADDLE_TRAINERS_NUM", num_trainers)
-    print("PADDLE_TRAINER_ID", trainer_id)
-    build_strategy.num_trainers = num_trainers
-    build_strategy.trainer_id = trainer_id
-    # NOTE(zcd): use multiple processes to train the model,
-    # each process using one GPU card.
-    startup_prog = fluid.Program()
-    nccl2_prepare(trainer_id, startup_prog, train_prog)
-    # the startup_prog is run twice, but it doesn't matter.
-    exe.run(startup_prog)
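A hedged sketch of how `prepare_for_multi_process` is typically wired into a trainer, assuming the helper survives at `utils/dist_utils.py`; the executor and program setup here are illustrative, not the repo's actual training loop:

```python
# Sketch only: the import path and the program construction are assumptions.
import paddle.fluid as fluid
from utils.dist_utils import prepare_for_multi_process

train_prog = fluid.Program()
startup_prog = fluid.Program()
# ... define the model and optimizer inside train_prog ...

exe = fluid.Executor(fluid.CUDAPlace(0))
build_strategy = fluid.BuildStrategy()

# No-op when PADDLE_TRAINERS_NUM < 2; otherwise it sets trainer count/id on
# build_strategy and runs the NCCL2 transpiler using the PADDLE_* env vars.
prepare_for_multi_process(exe, build_strategy, train_prog)
exe.run(startup_prog)
```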
diff --git a/PaddleNLP/neural_machine_translation/transformer/utils/input_field.py b/PaddleNLP/neural_machine_translation/transformer/utils/input_field.py
deleted file mode 100644
index de56712399df446baf73707494cb6ec8e7566b25..0000000000000000000000000000000000000000
--- a/PaddleNLP/neural_machine_translation/transformer/utils/input_field.py
+++ /dev/null
@@ -1,182 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from __future__ import division
-from __future__ import print_function
-
-import os
-import six
-import ast
-import copy
-
-import numpy as np
-import paddle.fluid as fluid
-
-
-class Placeholder(object):
-    def __init__(self):
-        self.shapes = []
-        self.dtypes = []
-        self.lod_levels = []
-        self.names = []
-
-    def __init__(self, input_shapes):
-
-        self.shapes = []
-        self.dtypes = []
-        self.lod_levels = []
-        self.names = []
-
-        for new_holder in input_shapes:
-            shape = new_holder[0]
-            dtype = new_holder[1]
-            lod_level = new_holder[2] if len(new_holder) >= 3 else 0
-            name = new_holder[3] if len(new_holder) >= 4 else ""
-
-            self.append_placeholder(
-                shape, dtype, lod_level=lod_level, name=name)
-
-    def append_placeholder(self, shape, dtype, lod_level=0, name=""):
-        self.shapes.append(shape)
-        self.dtypes.append(dtype)
-        self.lod_levels.append(lod_level)
-        self.names.append(name)
-
-    def build(self, capacity, reader_name, use_double_buffer=False):
-        pyreader = fluid.layers.py_reader(
-            capacity=capacity,
-            shapes=self.shapes,
-            dtypes=self.dtypes,
-            lod_levels=self.lod_levels,
-            name=reader_name,
-            use_double_buffer=use_double_buffer)
-
-        return [pyreader, fluid.layers.read_file(pyreader)]
-
-    def __add__(self, new_holder):
-        assert isinstance(new_holder, tuple) or isinstance(new_holder, list)
-        assert len(new_holder) >= 2
-
-        shape = new_holder[0]
-        dtype = new_holder[1]
-        lod_level = new_holder[2] if len(new_holder) >= 3 else 0
-        name = new_holder[3] if len(new_holder) >= 4 else ""
-
-        self.append_placeholder(shape, dtype, lod_level=lod_level, name=name)
-
-
-class InputField(object):
-    """
-    A high-level API for handling inputs in PaddlePaddle.
-    """
-
-    def __init__(self, input_slots=[]):
-
-        self.shapes = []
-        self.dtypes = []
-        self.names = []
-        self.lod_levels = []
-
-        self.input_slots = {}
-        self.feed_list_str = []
-        self.feed_list = []
-
-        self.loader = None
-
-        if input_slots:
-            for input_slot in input_slots:
-                self += input_slot
-
-    def __add__(self, input_slot):
-
-        if isinstance(input_slot, list) or isinstance(input_slot, tuple):
-            name = input_slot[0]
-            shape = input_slot[1]
-            dtype = input_slot[2]
-            lod_level = input_slot[3] if len(input_slot) == 4 else 0
-
-        if isinstance(input_slot, dict):
-            name = input_slot["name"]
-            shape = input_slot["shape"]
-            dtype = input_slot["dtype"]
-            lod_level = input_slot[
-                "lod_level"] if "lod_level" in input_slot else 0
-
-        self.shapes.append(shape)
-        self.dtypes.append(dtype)
-        self.names.append(name)
-        self.lod_levels.append(lod_level)
-
-        self.feed_list_str.append(name)
-
-        return self
-
-    def __getattr__(self, name):
-
-        if name not in self.input_slots:
-            raise Warning("the attr %s has not been defined yet." % name)
-            return None
-
-        return self.input_slots[name]
-
-    def build(self, build_pyreader=False, capacity=100, iterable=False):
-
-        for _name, _shape, _dtype, _lod_level in zip(
-                self.names, self.shapes, self.dtypes, self.lod_levels):
-            self.input_slots[_name] = fluid.data(
-                name=_name, shape=_shape, dtype=_dtype, lod_level=_lod_level)
-
-        for name in self.feed_list_str:
-            self.feed_list.append(self.input_slots[name])
-
-        self.loader = fluid.io.DataLoader.from_generator(
-            feed_list=self.feed_list,
-            capacity=capacity,
-            iterable=(not build_pyreader),
-            use_double_buffer=True)
-
-
-if __name__ == "__main__":
-
-    mnist_input_slots = [{
-        "name": "image",
-        "shape": (-1, 32, 32, 1),
-        "dtype": "int32"
-    }, {
-        "name": "label",
-        "shape": [-1, 1],
-        "dtype": "int64"
-    }]
-
-    input_field = InputField(mnist_input_slots)
-
-    input_field += {
-        "name": "large_image",
-        "shape": (-1, 64, 64, 1),
-        "dtype": "int32"
-    }
-    input_field += {
-        "name": "large_color_image",
-        "shape": (-1, 64, 64, 3),
-        "dtype": "int32"
-    }
-
-    input_field.build()
-
-    print(input_field.feed_list)
-
-    print(input_field.image)
-
-    print(input_field.large_color_image)
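Beyond the self-test in the deleted file, a short sketch of the `InputField` workflow as it feeds a DataLoader, assuming the module's new location at `utils/input_field.py`; the field names, shapes, and generator are invented for illustration:

```python
# Illustrative InputField usage; field definitions are made up for the sketch.
import numpy as np
import paddle.fluid as fluid
from utils.input_field import InputField

# Tuple form is (name, shape, dtype) with an optional trailing lod_level.
input_field = InputField([("src_word", [-1, 256], "int64"),
                          ("trg_word", [-1, 256], "int64")])
input_field.build()  # creates fluid.data variables and an iterable DataLoader

def batch_generator():
    # One dummy batch matching the declared shapes and dtypes.
    yield [np.zeros((4, 256), "int64"), np.zeros((4, 256), "int64")]

input_field.loader.set_batch_generator(batch_generator,
                                       places=fluid.cpu_places())
# Individual variables are reachable as attributes, e.g. input_field.src_word.
```

Because `build(build_pyreader=False)` creates the loader with `iterable=True`, batches are consumed by iterating `input_field.loader()` and feeding the results to an executor.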