From 160b7fa2a5c1af3c33eedc7ece5a12231d6e8730 Mon Sep 17 00:00:00 2001
From: Guo Sheng
Date: Mon, 28 Oct 2019 15:53:51 +0800
Subject: [PATCH] Cherry pick PaddleMT (#3786)

* Update Transformer using Paddle-1.6 APIs. (#3614)

* Update Transformer using Paddle-1.6 APIs.

* Add check_version for Paddle-1.6 and reorganize utils, including check,
  configuration, etc.

* mv neural_machine_translation to PaddleMT (#3732)

* Paddle1.6 transformer fix (#3755)

* Revert "mv neural_machine_translation to PaddleMT (#3732)"

This reverts commit a46365b6242e9135b76b885c96720b74c8962991.

* Delete the unremoved Transformer palm dir.

* Rename neural_machine_translation to PaddleMT.

* Update to PaddleMT in README.md
---
 .../transformer/.run_ce.sh                    |   0
 .../transformer/README.md                     |   7 +-
 .../transformer/__init__.py                   |   0
 .../transformer/_ce.py                        |   0
 .../transformer/desc.py                       |  18 +-
 .../transformer/gen_data.sh                   |   0
 .../images/multi_head_attention.png           | Bin
 .../images/transformer_network.png            | Bin
 .../transformer/inference_model.py            |   5 +-
 .../transformer/main.py                       |   5 +-
 .../transformer/predict.py                    |  13 +-
 .../transformer/reader.py                     |  16 +-
 .../transformer/train.py                      |  18 +-
 .../transformer/transformer.py                | 372 ++++++------------
 .../transformer/transformer.yaml              |   0
 .../transformer/utils}/__init__.py            |   0
 PaddleNLP/PaddleMT/transformer/utils/check.py |  61 +++
 .../transformer/utils}/configure.py           |   0
 .../transformer/utils}/dist_utils.py          |   0
 .../transformer/utils}/input_field.py         |  19 +-
 .../transformer/palm/toolkit/__init__.py      |   0
 21 files changed, 236 insertions(+), 298 deletions(-)
 rename PaddleNLP/{neural_machine_translation => PaddleMT}/transformer/.run_ce.sh (100%)
 rename PaddleNLP/{neural_machine_translation => PaddleMT}/transformer/README.md (98%)
 rename PaddleNLP/{neural_machine_translation => PaddleMT}/transformer/__init__.py (100%)
 rename PaddleNLP/{neural_machine_translation => PaddleMT}/transformer/_ce.py (100%)
 rename PaddleNLP/{neural_machine_translation => PaddleMT}/transformer/desc.py (91%)
 rename PaddleNLP/{neural_machine_translation => PaddleMT}/transformer/gen_data.sh (100%)
 rename PaddleNLP/{neural_machine_translation => PaddleMT}/transformer/images/multi_head_attention.png (100%)
 rename PaddleNLP/{neural_machine_translation => PaddleMT}/transformer/images/transformer_network.png (100%)
 rename PaddleNLP/{neural_machine_translation => PaddleMT}/transformer/inference_model.py (97%)
 rename PaddleNLP/{neural_machine_translation => PaddleMT}/transformer/main.py (91%)
 rename PaddleNLP/{neural_machine_translation => PaddleMT}/transformer/predict.py (96%)
 rename PaddleNLP/{neural_machine_translation => PaddleMT}/transformer/reader.py (98%)
 rename PaddleNLP/{neural_machine_translation => PaddleMT}/transformer/train.py (96%)
 rename PaddleNLP/{neural_machine_translation => PaddleMT}/transformer/transformer.py (76%)
 rename PaddleNLP/{neural_machine_translation => PaddleMT}/transformer/transformer.yaml (100%)
 rename PaddleNLP/{neural_machine_translation/transformer/palm => PaddleMT/transformer/utils}/__init__.py (100%)
 create mode 100644 PaddleNLP/PaddleMT/transformer/utils/check.py
 rename PaddleNLP/{neural_machine_translation/transformer/palm/toolkit => PaddleMT/transformer/utils}/configure.py (100%)
 rename PaddleNLP/{neural_machine_translation/transformer => PaddleMT/transformer/utils}/dist_utils.py (100%)
 rename PaddleNLP/{neural_machine_translation/transformer/palm/toolkit => PaddleMT/transformer/utils}/input_field.py (92%)
 delete mode 100644 PaddleNLP/neural_machine_translation/transformer/palm/toolkit/__init__.py
diff --git a/PaddleNLP/neural_machine_translation/transformer/.run_ce.sh b/PaddleNLP/PaddleMT/transformer/.run_ce.sh
similarity index 100%
rename from PaddleNLP/neural_machine_translation/transformer/.run_ce.sh
rename to PaddleNLP/PaddleMT/transformer/.run_ce.sh
diff --git a/PaddleNLP/neural_machine_translation/transformer/README.md b/PaddleNLP/PaddleMT/transformer/README.md
similarity index 98%
rename from PaddleNLP/neural_machine_translation/transformer/README.md
rename to PaddleNLP/PaddleMT/transformer/README.md
index 133a3a38..90d47f53 100644
--- a/PaddleNLP/neural_machine_translation/transformer/README.md
+++ b/PaddleNLP/PaddleMT/transformer/README.md
@@ -5,9 +5,8 @@
 ```text
 .
 ├── images             # images used in this README
-├── palm               # toolkit package
+├── utils              # toolkit package
 ├── desc.py            # input description file
-├── dist_utils.py      # multi-process training utilities
 ├── gen_data.sh        # data generation script
 ├── inference_model.py # script to save an inference_model
 ├── main.py            # main entry point
@@ -33,14 +32,14 @@
 1. Install PaddlePaddle

-   This project requires PaddlePaddle Fluid 1.5.0 or later; see the [installation guide](http://www.paddlepaddle.org/#quick-start) for setup.
+   This project requires PaddlePaddle 1.6 or later, or a suitable develop build; see the [installation guide](http://www.paddlepaddle.org/#quick-start) for setup.

 2. Download the code

    Clone the repository locally:
   ```shell
   git clone https://github.com/PaddlePaddle/models.git
-  cd models/PaddleNLP/neural_machine_translation/transformer
+  cd models/PaddleNLP/PaddleMT/transformer
   ```

 3. Environment dependencies
diff --git a/PaddleNLP/neural_machine_translation/transformer/__init__.py b/PaddleNLP/PaddleMT/transformer/__init__.py
similarity index 100%
rename from PaddleNLP/neural_machine_translation/transformer/__init__.py
rename to PaddleNLP/PaddleMT/transformer/__init__.py
diff --git a/PaddleNLP/neural_machine_translation/transformer/_ce.py b/PaddleNLP/PaddleMT/transformer/_ce.py
similarity index 100%
rename from PaddleNLP/neural_machine_translation/transformer/_ce.py
rename to PaddleNLP/PaddleMT/transformer/_ce.py
diff --git a/PaddleNLP/neural_machine_translation/transformer/desc.py b/PaddleNLP/PaddleMT/transformer/desc.py
similarity index 91%
rename from PaddleNLP/neural_machine_translation/transformer/desc.py
rename to PaddleNLP/PaddleMT/transformer/desc.py
index d1c40b45..d6c34191 100644
--- a/PaddleNLP/neural_machine_translation/transformer/desc.py
+++ b/PaddleNLP/PaddleMT/transformer/desc.py
@@ -15,9 +15,9 @@
-# The placeholder for batch_size in compile time. Must be -1 currently to be
-# consistent with some ops' infer-shape output in compile time, such as the
-# sequence_expand op used in beamsearch decoder.
-batch_size = -1
+# The placeholder for batch_size in compile time. None denotes a variable
+# dimension for the 1.6 data layers.
+batch_size = None
 # The placeholder for sequence length in compile time.
-seq_len = 256
+seq_len = None
 # The placeholder for head number in compile time.
 n_head = 8
 # The placeholder for model dim in compile time.
@@ -27,11 +27,11 @@ d_model = 512
 # compile time.
 input_descs = {
     # The actual data shape of src_word is:
-    # [batch_size, max_src_len_in_batch, 1]
-    "src_word": [(batch_size, seq_len, 1), "int64", 2],
+    # [batch_size, max_src_len_in_batch]
+    "src_word": [(batch_size, seq_len), "int64", 2],
     # The actual data shape of src_pos is:
-    # [batch_size, max_src_len_in_batch, 1]
-    "src_pos": [(batch_size, seq_len, 1), "int64"],
+    # [batch_size, max_src_len_in_batch]
+    "src_pos": [(batch_size, seq_len), "int64"],
     # This input is used to remove attention weights on paddings in the
     # encoder.
     # The actual data shape of src_slf_attn_bias is:
@@ -39,11 +39,11 @@ input_descs = {
     "src_slf_attn_bias": [(batch_size, n_head, seq_len, seq_len), "float32"],
     # The actual data shape of trg_word is:
-    # [batch_size, max_trg_len_in_batch, 1]
-    "trg_word": [(batch_size, seq_len, 1), "int64",
+    # [batch_size, max_trg_len_in_batch]
+    "trg_word": [(batch_size, seq_len), "int64",
                  2],  # lod_level is only used in fast decoder.
     # The actual data shape of trg_pos is:
-    # [batch_size, max_trg_len_in_batch, 1]
-    "trg_pos": [(batch_size, seq_len, 1), "int64"],
+    # [batch_size, max_trg_len_in_batch]
+    "trg_pos": [(batch_size, seq_len), "int64"],
     # This input is used to remove attention weights on paddings and
     # subsequent words in the decoder.
     # The actual data shape of trg_slf_attn_bias is:
@@ -60,11 +60,11 @@ input_descs = {
     "enc_output": [(batch_size, seq_len, d_model), "float32"],
     # The actual data shape of label_word is:
     # [batch_size * max_trg_len_in_batch, 1]
-    "lbl_word": [(batch_size * seq_len, 1), "int64"],
+    "lbl_word": [(None, 1), "int64"],
     # This input is used to mask out the loss of padding tokens.
     # The actual data shape of label_weight is:
     # [batch_size * max_trg_len_in_batch, 1]
-    "lbl_weight": [(batch_size * seq_len, 1), "float32"],
+    "lbl_weight": [(None, 1), "float32"],
     # This input is used in beam-search decoder.
     "init_score": [(batch_size, 1), "float32", 2],
     # This input is used in beam-search decoder for the first gather
diff --git a/PaddleNLP/neural_machine_translation/transformer/gen_data.sh b/PaddleNLP/PaddleMT/transformer/gen_data.sh
similarity index 100%
rename from PaddleNLP/neural_machine_translation/transformer/gen_data.sh
rename to PaddleNLP/PaddleMT/transformer/gen_data.sh
diff --git a/PaddleNLP/neural_machine_translation/transformer/images/multi_head_attention.png b/PaddleNLP/PaddleMT/transformer/images/multi_head_attention.png
similarity index 100%
rename from PaddleNLP/neural_machine_translation/transformer/images/multi_head_attention.png
rename to PaddleNLP/PaddleMT/transformer/images/multi_head_attention.png
diff --git a/PaddleNLP/neural_machine_translation/transformer/images/transformer_network.png b/PaddleNLP/PaddleMT/transformer/images/transformer_network.png
similarity index 100%
rename from PaddleNLP/neural_machine_translation/transformer/images/transformer_network.png
rename to PaddleNLP/PaddleMT/transformer/images/transformer_network.png
diff --git a/PaddleNLP/neural_machine_translation/transformer/inference_model.py b/PaddleNLP/PaddleMT/transformer/inference_model.py
similarity index 97%
rename from PaddleNLP/neural_machine_translation/transformer/inference_model.py
rename to PaddleNLP/PaddleMT/transformer/inference_model.py
index d1b88f5b..40fc7ede 100644
--- a/PaddleNLP/neural_machine_translation/transformer/inference_model.py
+++ b/PaddleNLP/PaddleMT/transformer/inference_model.py
@@ -22,9 +22,8 @@
 import numpy as np
 import paddle
 import paddle.fluid as fluid

-#include palm for easier nlp coding
-from palm.toolkit.input_field import InputField
-from palm.toolkit.configure import PDConfig
+from utils.input_field import InputField
+from utils.configure import PDConfig

 # include task-specific libs
 import desc
diff --git a/PaddleNLP/neural_machine_translation/transformer/main.py b/PaddleNLP/PaddleMT/transformer/main.py
similarity index 91%
rename from PaddleNLP/neural_machine_translation/transformer/main.py
rename to PaddleNLP/PaddleMT/transformer/main.py
index 6ff929af..feaf29ba 100644
--- a/PaddleNLP/neural_machine_translation/transformer/main.py
+++ b/PaddleNLP/PaddleMT/transformer/main.py
@@ -20,13 +20,12 @@
 import numpy as np
 import paddle
 import paddle.fluid as fluid

-#include palm for easier nlp coding
-from palm.toolkit.configure import PDConfig
-
+from utils.configure import PDConfig
 from train import do_train
 from predict import do_predict
 from inference_model import do_save_inference_model

+
 if __name__ == "__main__":
     LOG_FORMAT = "[%(asctime)s %(levelname)s %(filename)s:%(lineno)d] %(message)s"
     logging.basicConfig(
diff --git a/PaddleNLP/neural_machine_translation/transformer/predict.py b/PaddleNLP/PaddleMT/transformer/predict.py
similarity index 96%
rename from PaddleNLP/neural_machine_translation/transformer/predict.py
rename to PaddleNLP/PaddleMT/transformer/predict.py
index d896517f..7ad847fd 100644
--- a/PaddleNLP/neural_machine_translation/transformer/predict.py
+++ b/PaddleNLP/PaddleMT/transformer/predict.py
@@ -22,9 +22,9 @@
 import numpy as np
 import paddle
 import paddle.fluid as fluid

-#include palm for easier nlp coding
-from palm.toolkit.input_field import InputField
-from palm.toolkit.configure import PDConfig
+from utils.input_field import InputField
+from utils.configure import PDConfig
+from utils.check import check_gpu, check_version

 # include task-specific libs
 import desc
@@ -149,7 +149,6 @@ def do_predict(args):
             is_training=False, model_input=input_field, args=args)

         out_ids, out_scores = predictions
-        out_ids.persistable = out_scores.persistable = True

         # This is used here to set dropout to the test mode.
         test_prog = test_prog.clone(for_test=True)
@@ -185,8 +184,8 @@ def do_predict(args):
     f = open(args.output_file, "wb")

     # start predicting
-    ## decorate the pyreader with batch_generator
-    input_field.reader.decorate_batch_generator(batch_generator)
-    input_field.reader.start()
+    ## decorate the data loader with batch_generator
+    input_field.loader.set_batch_generator(batch_generator)
+    input_field.loader.start()
     while True:
         try:
             seq_ids, seq_scores = exe.run(
@@ -231,5 +230,7 @@
     args = PDConfig(yaml_file="./transformer.yaml")
     args.build()
     args.Print()
+    check_gpu(args.use_cuda)
+    check_version()

     do_predict(args)
diff --git a/PaddleNLP/neural_machine_translation/transformer/reader.py b/PaddleNLP/PaddleMT/transformer/reader.py
similarity index 98%
rename from PaddleNLP/neural_machine_translation/transformer/reader.py
rename to PaddleNLP/PaddleMT/transformer/reader.py
index e3be69b7..e69b4a25 100644
--- a/PaddleNLP/neural_machine_translation/transformer/reader.py
+++ b/PaddleNLP/PaddleMT/transformer/reader.py
@@ -84,12 +84,12 @@ def prepare_train_input(insts, src_pad_idx, trg_pad_idx, n_head):
     """
     src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data(
         [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False)
-    src_word = src_word.reshape(-1, src_max_len, 1)
-    src_pos = src_pos.reshape(-1, src_max_len, 1)
+    src_word = src_word.reshape(-1, src_max_len)
+    src_pos = src_pos.reshape(-1, src_max_len)
     trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = pad_batch_data(
         [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True)
-    trg_word = trg_word.reshape(-1, trg_max_len, 1)
-    trg_pos = trg_pos.reshape(-1, trg_max_len, 1)
+    trg_word = trg_word.reshape(-1, trg_max_len)
+    trg_pos = trg_pos.reshape(-1, trg_max_len)

     trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
                                 [1, 1, trg_max_len, 1]).astype("float32")
@@ -103,6 +103,8 @@ def prepare_train_input(insts, src_pad_idx, trg_pad_idx, n_head):
         return_attn_bias=False,
         return_max_len=False,
         return_num_token=True)
+    lbl_word = lbl_word.reshape(-1, 1)
+    lbl_weight = lbl_weight.reshape(-1, 1)

     data_inputs = [
         src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
@@ -122,9 +124,9 @@ def prepare_infer_input(insts, src_pad_idx, bos_idx, n_head, place):
     trg_word = np.asarray([[bos_idx]] * len(insts), dtype="int64")
     trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
                                 [1, 1, 1, 1]).astype("float32")
-    trg_word = trg_word.reshape(-1, 1, 1)
-    src_word = src_word.reshape(-1, src_max_len, 1)
-    src_pos = src_pos.reshape(-1, src_max_len, 1)
+    trg_word = trg_word.reshape(-1, 1)
+    src_word = src_word.reshape(-1, src_max_len)
+    src_pos = src_pos.reshape(-1, src_max_len)

     def to_lodtensor(data, place, lod=None):
         data_tensor = fluid.LoDTensor()
diff --git a/PaddleNLP/neural_machine_translation/transformer/train.py b/PaddleNLP/PaddleMT/transformer/train.py
similarity index 96%
rename from PaddleNLP/neural_machine_translation/transformer/train.py
rename to PaddleNLP/PaddleMT/transformer/train.py
index 57fa3b91..48b4847f 100644
--- a/PaddleNLP/neural_machine_translation/transformer/train.py
+++ b/PaddleNLP/PaddleMT/transformer/train.py
@@ -22,13 +22,13 @@
 import numpy as np
 import paddle
 import paddle.fluid as fluid

-#include palm for easier nlp coding
-from palm.toolkit.input_field import InputField
-from palm.toolkit.configure import PDConfig
+import utils.dist_utils as dist_utils
+from utils.input_field import InputField
+from utils.configure import PDConfig
+from utils.check import check_gpu, check_version

 # include task-specific libs
 import desc
-import dist_utils
 import reader
 from transformer import create_net, position_encoding_init

@@ -188,8 +188,6 @@ def do_train(args):
         sum_cost, avg_cost, token_num = create_net(
             is_training=True, model_input=input_field, args=args)

-        sum_cost.persistable = avg_cost.persistable = token_num.persistable = True
-
         # define the optimizer

         with fluid.default_main_program()._lr_schedule_guard():
@@ -206,7 +204,7 @@ def do_train(args):
     # prepare training

-    ## decorate the pyreader with batch_generator
-    input_field.reader.decorate_batch_generator(batch_generator)
+    ## decorate the data loader with batch_generator
+    input_field.loader.set_batch_generator(batch_generator)

     ## define the executor and program for training

@@ -254,7 +252,7 @@ def do_train(args):
     step_idx = 0
     for pass_id in range(args.epoch):
         pass_start_time = time.time()
-        input_field.reader.start()
+        input_field.loader.start()

         batch_id = 0
         while True:
@@ -303,7 +301,7 @@ def do_train(args):
                     step_idx += 1

             except fluid.core.EOFException:
-                input_field.reader.reset()
+                input_field.loader.reset()
                 break

         time_consumed = time.time() - pass_start_time
@@ -323,5 +321,7 @@
     args = PDConfig(yaml_file="./transformer.yaml")
     args.build()
     args.Print()
+    check_gpu(args.use_cuda)
+    check_version()

     do_train(args)
diff --git a/PaddleNLP/neural_machine_translation/transformer/transformer.py b/PaddleNLP/PaddleMT/transformer/transformer.py
similarity index 76%
rename from PaddleNLP/neural_machine_translation/transformer/transformer.py
rename to PaddleNLP/PaddleMT/transformer/transformer.py
index f21c6295..be20001b 100644
--- a/PaddleNLP/neural_machine_translation/transformer/transformer.py
+++ b/PaddleNLP/PaddleMT/transformer/transformer.py
@@ -297,20 +297,19 @@ def prepare_encoder_decoder(src_word,
         [batch_size, max_src_length_in_batch, d_model]. This module is
         used at the bottom of the encoder stacks.
""" - src_word_emb = layers.embedding( + src_word_emb = fluid.embedding( src_word, size=[src_vocab_size, src_emb_dim], padding_idx=bos_idx, # set embedding of bos to 0 - param_attr=fluid.ParamAttr( - name=word_emb_param_name, - initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5))) + param_attr=fluid.ParamAttr(name=word_emb_param_name, + initializer=fluid.initializer.Normal( + 0., src_emb_dim**-0.5))) src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5) - src_pos_enc = layers.embedding( - src_pos, - size=[src_max_len, src_emb_dim], - param_attr=fluid.ParamAttr( - name=pos_enc_param_name, trainable=False)) + src_pos_enc = fluid.embedding(src_pos, + size=[src_max_len, src_emb_dim], + param_attr=fluid.ParamAttr( + name=pos_enc_param_name, trainable=False)) src_pos_enc.stop_gradient = True enc_input = src_word_emb + src_pos_enc return layers.dropout( @@ -506,38 +505,8 @@ def decoder(dec_input, return dec_output -def make_all_inputs(input_fields): - """ - Define the input data layers for the transformer model. - """ - inputs = [] - for input_field in input_fields: - input_var = layers.data( - name=input_field, - shape=input_descs[input_field][0], - dtype=input_descs[input_field][1], - lod_level=input_descs[input_field][2] - if len(input_descs[input_field]) == 3 else 0, - append_batch_size=False) - inputs.append(input_var) - return inputs - - -def make_all_py_reader_inputs(input_fields, is_test=False): - reader = layers.py_reader( - capacity=20, - name="test_reader" if is_test else "train_reader", - shapes=[input_descs[input_field][0] for input_field in input_fields], - dtypes=[input_descs[input_field][1] for input_field in input_fields], - lod_levels=[ - input_descs[input_field][2] - if len(input_descs[input_field]) == 3 else 0 - for input_field in input_fields - ]) - return layers.read_file(reader), reader - - -def transformer(src_vocab_size, +def transformer(model_input, + src_vocab_size, trg_vocab_size, max_length, n_layer, @@ -554,96 +523,76 @@ def transformer(src_vocab_size, weight_sharing, label_smooth_eps, bos_idx=0, - use_py_reader=False, - is_test=False, - model_input=None): + is_test=False): if weight_sharing: assert src_vocab_size == trg_vocab_size, ( "Vocabularies in source and target should be same for weight sharing." 
         )
-    if model_input:
-        enc_inputs = (model_input.src_word, model_input.src_pos,
-                      model_input.src_slf_attn_bias)
-        dec_inputs = (model_input.trg_word, model_input.trg_pos,
-                      model_input.trg_slf_attn_bias,
-                      model_input.trg_src_attn_bias)
-        label = model_input.lbl_word
-        weights = model_input.lbl_weight
-    else:
-        data_input_names = encoder_data_input_fields + \
-            decoder_data_input_fields[:-1] + label_data_input_fields
-
-        if use_py_reader:
-            all_inputs, reader = make_all_py_reader_inputs(data_input_names,
-                                                           is_test)
-        else:
-            all_inputs = make_all_inputs(data_input_names)
-
-        enc_inputs_len = len(encoder_data_input_fields)
-        dec_inputs_len = len(decoder_data_input_fields[:-1])
-        enc_inputs = all_inputs[0:enc_inputs_len]
-        dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + dec_inputs_len]
-        label = all_inputs[-2]
-        weights = all_inputs[-1]
-
-    enc_output = wrap_encoder(
-        src_vocab_size,
-        max_length,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        prepostprocess_dropout,
-        attention_dropout,
-        relu_dropout,
-        preprocess_cmd,
-        postprocess_cmd,
-        weight_sharing,
-        enc_inputs,
-        bos_idx=bos_idx)
-
-    predict = wrap_decoder(
-        trg_vocab_size,
-        max_length,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        prepostprocess_dropout,
-        attention_dropout,
-        relu_dropout,
-        preprocess_cmd,
-        postprocess_cmd,
-        weight_sharing,
-        dec_inputs,
-        enc_output, )
+    enc_inputs = (model_input.src_word, model_input.src_pos,
+                  model_input.src_slf_attn_bias)
+    dec_inputs = (model_input.trg_word, model_input.trg_pos,
+                  model_input.trg_slf_attn_bias, model_input.trg_src_attn_bias)
+    label = model_input.lbl_word
+    weights = model_input.lbl_weight
+
+    enc_output = wrap_encoder(enc_inputs,
+                              src_vocab_size,
+                              max_length,
+                              n_layer,
+                              n_head,
+                              d_key,
+                              d_value,
+                              d_model,
+                              d_inner_hid,
+                              prepostprocess_dropout,
+                              attention_dropout,
+                              relu_dropout,
+                              preprocess_cmd,
+                              postprocess_cmd,
+                              weight_sharing,
+                              bos_idx=bos_idx)
+
+    predict = wrap_decoder(dec_inputs,
+                           trg_vocab_size,
+                           max_length,
+                           n_layer,
+                           n_head,
+                           d_key,
+                           d_value,
+                           d_model,
+                           d_inner_hid,
+                           prepostprocess_dropout,
+                           attention_dropout,
+                           relu_dropout,
+                           preprocess_cmd,
+                           postprocess_cmd,
+                           weight_sharing,
+                           enc_output=enc_output)

     # The padding index does not contribute to the total loss. The weights are
     # used to cancel padding positions when calculating the loss.
     if label_smooth_eps:
-        label = layers.label_smooth(
-            label=layers.one_hot(
-                input=label, depth=trg_vocab_size),
-            epsilon=label_smooth_eps)
+        # TODO: use fluid.input.one_hot after softmax_with_cross_entropy removes
+        # the enforcement that the last dimension of label must be 1.
+        label = layers.label_smooth(label=layers.one_hot(input=label,
+                                                         depth=trg_vocab_size),
+                                    epsilon=label_smooth_eps)

     cost = layers.softmax_with_cross_entropy(
         logits=predict,
         label=label,
         soft_label=True if label_smooth_eps else False)
-    weighted_cost = cost * weights
+    weighted_cost = layers.elementwise_mul(x=cost, y=weights, axis=0)
     sum_cost = layers.reduce_sum(weighted_cost)
     token_num = layers.reduce_sum(weights)
     token_num.stop_gradient = True
     avg_cost = sum_cost / token_num
-    return sum_cost, avg_cost, predict, token_num, reader if use_py_reader else None
+    return sum_cost, avg_cost, predict, token_num


-def wrap_encoder(src_vocab_size,
+def wrap_encoder(enc_inputs,
+                 src_vocab_size,
                  max_length,
                  n_layer,
                  n_head,
@@ -657,17 +606,11 @@ def wrap_encoder(src_vocab_size,
                  preprocess_cmd,
                  postprocess_cmd,
                  weight_sharing,
-                 enc_inputs=None,
                  bos_idx=0):
     """
     The wrapper assembles together all needed layers for the encoder.
     """
-    if enc_inputs is None:
-        # This is used to implement independent encoder program in inference.
-        src_word, src_pos, src_slf_attn_bias = make_all_inputs(
-            encoder_data_input_fields)
-    else:
-        src_word, src_pos, src_slf_attn_bias = enc_inputs
+    src_word, src_pos, src_slf_attn_bias = enc_inputs
     enc_input = prepare_encoder(
         src_word,
         src_pos,
@@ -694,7 +637,8 @@
     return enc_output


-def wrap_decoder(trg_vocab_size,
+def wrap_decoder(dec_inputs,
+                 trg_vocab_size,
                  max_length,
                  n_layer,
                  n_head,
@@ -708,7 +652,6 @@ def wrap_decoder(trg_vocab_size,
                  preprocess_cmd,
                  postprocess_cmd,
                  weight_sharing,
-                 dec_inputs=None,
                  enc_output=None,
                  caches=None,
                  gather_idx=None,
@@ -716,12 +659,7 @@
     """
     The wrapper assembles together all needed layers for the decoder.
     """
-    if dec_inputs is None:
-        # This is used to implement independent decoder program in inference.
-        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output = \
-            make_all_inputs(decoder_data_input_fields)
-    else:
-        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
+    trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs

     dec_input = prepare_decoder(
         trg_word,
@@ -770,66 +708,36 @@ def wrap_decoder(trg_vocab_size,
     return predict


-def fast_decode(src_vocab_size,
-                trg_vocab_size,
-                max_in_len,
-                n_layer,
-                n_head,
-                d_key,
-                d_value,
-                d_model,
-                d_inner_hid,
-                prepostprocess_dropout,
-                attention_dropout,
-                relu_dropout,
-                preprocess_cmd,
-                postprocess_cmd,
-                weight_sharing,
-                beam_size,
-                max_out_len,
-                bos_idx,
-                eos_idx,
-                use_py_reader=False,
-                model_input=None):
+def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len,
+                n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
+                prepostprocess_dropout, attention_dropout, relu_dropout,
+                preprocess_cmd, postprocess_cmd, weight_sharing, beam_size,
+                max_out_len, bos_idx, eos_idx):
     """
     Use beam search to decode. Caches will be used to store states of history
    steps which can make the decoding faster.
""" - if model_input: - enc_inputs = (model_input.src_word, model_input.src_pos, - model_input.src_slf_attn_bias) - dec_inputs = (model_input.trg_word, model_input.init_score, - model_input.init_idx, model_input.trg_src_attn_bias) - else: - data_input_names = encoder_data_input_fields + fast_decoder_data_input_fields - - if use_py_reader: - all_inputs, reader = make_all_py_reader_inputs(data_input_names) - else: - all_inputs = make_all_inputs(data_input_names) - - enc_inputs_len = len(encoder_data_input_fields) - dec_inputs_len = len(fast_decoder_data_input_fields) - enc_inputs = all_inputs[0:enc_inputs_len] - dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + dec_inputs_len] - - enc_output = wrap_encoder( - src_vocab_size, - max_in_len, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - enc_inputs, - bos_idx=bos_idx) + enc_inputs = (model_input.src_word, model_input.src_pos, + model_input.src_slf_attn_bias) + dec_inputs = (model_input.trg_word, model_input.init_score, + model_input.init_idx, model_input.trg_src_attn_bias) + + enc_output = wrap_encoder(enc_inputs, + src_vocab_size, + max_in_len, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + bos_idx=bos_idx) start_tokens, init_scores, parent_idx, trg_src_attn_bias = dec_inputs def beam_search(): @@ -875,7 +783,7 @@ def fast_decode(src_vocab_size, pre_ids = layers.array_read(array=ids, i=step_idx) # Since beam_search_op dosen't enforce pre_ids' shape, we can do # inplace reshape here which actually change the shape of pre_ids. 
-            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
+            # pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
             pre_scores = layers.array_read(array=scores, i=step_idx)
             # gather cell states corresponding to selected parent
             pre_src_attn_bias = layers.gather(
@@ -884,30 +792,29 @@
                 x=layers.fill_constant_batch_size_like(
                     input=pre_src_attn_bias,  # can't use lod tensor here
                     value=1,
-                    shape=[-1, 1, 1],
+                    shape=[-1, 1],
                     dtype=pre_ids.dtype),
                 y=step_idx,
                 axis=0)
-            logits = wrap_decoder(
-                trg_vocab_size,
-                max_in_len,
-                n_layer,
-                n_head,
-                d_key,
-                d_value,
-                d_model,
-                d_inner_hid,
-                prepostprocess_dropout,
-                attention_dropout,
-                relu_dropout,
-                preprocess_cmd,
-                postprocess_cmd,
-                weight_sharing,
-                dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias),
-                enc_output=enc_output,
-                caches=caches,
-                gather_idx=parent_idx,
-                bos_idx=bos_idx)
+            logits = wrap_decoder((pre_ids, pre_pos, None, pre_src_attn_bias),
+                                  trg_vocab_size,
+                                  max_in_len,
+                                  n_layer,
+                                  n_head,
+                                  d_key,
+                                  d_value,
+                                  d_model,
+                                  d_inner_hid,
+                                  prepostprocess_dropout,
+                                  attention_dropout,
+                                  relu_dropout,
+                                  preprocess_cmd,
+                                  postprocess_cmd,
+                                  weight_sharing,
+                                  enc_output=enc_output,
+                                  caches=caches,
+                                  gather_idx=parent_idx,
+                                  bos_idx=bos_idx)
             # intra-beam topK
             topk_scores, topk_indices = layers.topk(
                 input=layers.softmax(logits), k=beam_size)
@@ -941,51 +848,26 @@
         return finished_ids, finished_scores

     finished_ids, finished_scores = beam_search()
-    return finished_ids, finished_scores, reader if use_py_reader else None
+    return finished_ids, finished_scores


 def create_net(is_training, model_input, args):
     if is_training:
-        sum_cost, avg_cost, _, token_num, _ = transformer(
-            args.src_vocab_size,
-            args.trg_vocab_size,
-            args.max_length + 1,
-            args.n_layer,
-            args.n_head,
-            args.d_key,
-            args.d_value,
-            args.d_model,
-            args.d_inner_hid,
-            args.prepostprocess_dropout,
-            args.attention_dropout,
-            args.relu_dropout,
-            args.preprocess_cmd,
-            args.postprocess_cmd,
-            args.weight_sharing,
-            args.label_smooth_eps,
-            args.bos_idx,
-            model_input=model_input)
+        sum_cost, avg_cost, _, token_num = transformer(
+            model_input, args.src_vocab_size, args.trg_vocab_size,
+            args.max_length + 1, args.n_layer, args.n_head, args.d_key,
+            args.d_value, args.d_model, args.d_inner_hid,
+            args.prepostprocess_dropout, args.attention_dropout,
+            args.relu_dropout, args.preprocess_cmd, args.postprocess_cmd,
+            args.weight_sharing, args.label_smooth_eps, args.bos_idx)
         return sum_cost, avg_cost, token_num
     else:
-        out_ids, out_scores, _ = fast_decode(
-            args.src_vocab_size,
-            args.trg_vocab_size,
-            args.max_length + 1,
-            args.n_layer,
-            args.n_head,
-            args.d_key,
-            args.d_value,
-            args.d_model,
-            args.d_inner_hid,
-            args.prepostprocess_dropout,
-            args.attention_dropout,
-            args.relu_dropout,
-            args.preprocess_cmd,
-            args.postprocess_cmd,
-            args.weight_sharing,
-            args.beam_size,
-            args.max_out_len,
-            args.bos_idx,
-            args.eos_idx,
-            model_input=model_input)
+        out_ids, out_scores = fast_decode(
+            model_input, args.src_vocab_size, args.trg_vocab_size,
+            args.max_length + 1, args.n_layer, args.n_head, args.d_key,
+            args.d_value, args.d_model, args.d_inner_hid,
+            args.prepostprocess_dropout, args.attention_dropout,
+            args.relu_dropout, args.preprocess_cmd, args.postprocess_cmd,
+            args.weight_sharing, args.beam_size, args.max_out_len, args.bos_idx,
+            args.eos_idx)
         return out_ids, out_scores
diff --git a/PaddleNLP/neural_machine_translation/transformer/transformer.yaml b/PaddleNLP/PaddleMT/transformer/transformer.yaml
similarity index 100%
rename from PaddleNLP/neural_machine_translation/transformer/transformer.yaml
rename to PaddleNLP/PaddleMT/transformer/transformer.yaml
diff --git a/PaddleNLP/neural_machine_translation/transformer/palm/__init__.py b/PaddleNLP/PaddleMT/transformer/utils/__init__.py
similarity index 100%
rename from PaddleNLP/neural_machine_translation/transformer/palm/__init__.py
rename to PaddleNLP/PaddleMT/transformer/utils/__init__.py
diff --git a/PaddleNLP/PaddleMT/transformer/utils/check.py b/PaddleNLP/PaddleMT/transformer/utils/check.py
new file mode 100644
index 00000000..305fa370
--- /dev/null
+++ b/PaddleNLP/PaddleMT/transformer/utils/check.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+import paddle.fluid as fluid
+
+import logging
+logger = logging.getLogger(__name__)
+
+__all__ = ['check_gpu', 'check_version']
+
+
+def check_gpu(use_gpu):
+    """
+    Log an error and exit when use_gpu is set to true but the CPU version
+    of paddlepaddle is installed.
+    """
+    err = "Config use_gpu cannot be set as true while you are " \
+          "using the paddlepaddle cpu version!\nPlease try:\n" \
+          "\t1. Install paddlepaddle-gpu to run the model on GPU\n" \
+          "\t2. Set use_gpu as false in the config file to run " \
+          "the model on CPU"
+
+    try:
+        if use_gpu and not fluid.is_compiled_with_cuda():
+            logger.error(err)
+            sys.exit(1)
+    except Exception as e:
+        pass
+
+
+def check_version():
+    """
+    Log an error and exit when the installed version of paddlepaddle is
+    not satisfied.
+    """
+    err = "PaddlePaddle version 1.6 or higher, or a suitable develop " \
+          "version, is required.\n" \
+          "Please make sure the installed version matches your code."
+
+    try:
+        fluid.require_version('1.6.0')
+    except Exception as e:
+        logger.error(err)
+        sys.exit(1)
diff --git a/PaddleNLP/neural_machine_translation/transformer/palm/toolkit/configure.py b/PaddleNLP/PaddleMT/transformer/utils/configure.py
similarity index 100%
rename from PaddleNLP/neural_machine_translation/transformer/palm/toolkit/configure.py
rename to PaddleNLP/PaddleMT/transformer/utils/configure.py
diff --git a/PaddleNLP/neural_machine_translation/transformer/dist_utils.py b/PaddleNLP/PaddleMT/transformer/utils/dist_utils.py
similarity index 100%
rename from PaddleNLP/neural_machine_translation/transformer/dist_utils.py
rename to PaddleNLP/PaddleMT/transformer/utils/dist_utils.py
diff --git a/PaddleNLP/neural_machine_translation/transformer/palm/toolkit/input_field.py b/PaddleNLP/PaddleMT/transformer/utils/input_field.py
similarity index 92%
rename from PaddleNLP/neural_machine_translation/transformer/palm/toolkit/input_field.py
rename to PaddleNLP/PaddleMT/transformer/utils/input_field.py
index c34cb870..de567123 100644
--- a/PaddleNLP/neural_machine_translation/transformer/palm/toolkit/input_field.py
+++ b/PaddleNLP/PaddleMT/transformer/utils/input_field.py
@@ -93,7 +93,7 @@ class InputField(object):
         self.feed_list_str = []
         self.feed_list = []

-        self.reader = None
+        self.loader = None

         if input_slots:
             for input_slot in input_slots:
@@ -135,22 +135,17 @@ class InputField(object):

         for _name, _shape, _dtype, _lod_level in zip(
                 self.names, self.shapes, self.dtypes, self.lod_levels):
-            self.input_slots[_name] = fluid.layers.data(
+            self.input_slots[_name] = fluid.data(
                 name=_name, shape=_shape, dtype=_dtype, lod_level=_lod_level)

         for name in self.feed_list_str:
             self.feed_list.append(self.input_slots[name])

-        if build_pyreader:
-            self.reader = fluid.io.PyReader(
-                feed_list=self.feed_list, capacity=capacity, iterable=iterable)
-
-    def start(self, generator=None):
-
-        if generator is not None:
-            self.reader.decorate_batch_generator(generator)
-
-        self.reader.start()
+        self.loader = fluid.io.DataLoader.from_generator(
+            feed_list=self.feed_list,
+            capacity=capacity,
+            iterable=(not build_pyreader),
+            use_double_buffer=True)


 if __name__ == "__main__":
diff --git a/PaddleNLP/neural_machine_translation/transformer/palm/toolkit/__init__.py b/PaddleNLP/neural_machine_translation/transformer/palm/toolkit/__init__.py
deleted file mode 100644
index e69de29b..00000000
--
GitLab
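
For readers new to the 1.6 APIs this patch moves to, here is a minimal, self-contained sketch of the PyReader-to-DataLoader migration adopted in predict.py, train.py, and input_field.py above. It is an illustration only, assuming Paddle 1.6; the feed variable, capacity, and toy generator are placeholders, not values from this patch:

```python
import numpy as np
import paddle.fluid as fluid

# fluid.data (1.6) replaces fluid.layers.data: None marks a variable
# dimension, and the trailing size-1 dim on id inputs is dropped.
src_word = fluid.data(name="src_word", shape=[None, None], dtype="int64")

# DataLoader.from_generator replaces fluid.io.PyReader; the non-iterable
# mode keeps the start()/reset()/EOFException control flow used in train.py.
loader = fluid.io.DataLoader.from_generator(
    feed_list=[src_word], capacity=20, iterable=False, use_double_buffer=True)


def batch_generator():
    # toy batches standing in for the output of reader.prepare_train_input
    for _ in range(4):
        yield [np.random.randint(0, 100, size=(2, 8)).astype("int64")]


loader.set_batch_generator(batch_generator)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
loader.start()
while True:
    try:
        exe.run(fetch_list=[src_word.name])
    except fluid.core.EOFException:
        loader.reset()  # rewind the loader for the next pass
        break
```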
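In the same spirit, a small sketch of the fluid.embedding call style that replaces layers.embedding in transformer.py above; the vocabulary size, embedding dim, and parameter name are made-up stand-ins rather than values from the model config:

```python
import paddle.fluid as fluid

# fluid.embedding (1.6) consumes ids shaped [batch, seq_len] directly,
# where the older fluid.layers.embedding expected [batch, seq_len, 1].
word_ids = fluid.data(name="word_ids", shape=[None, None], dtype="int64")
emb = fluid.embedding(
    word_ids,
    size=[30000, 512],  # [vocab_size, emb_dim], placeholder values
    padding_idx=0,  # zero out and freeze this row, as done for bos_idx above
    param_attr=fluid.ParamAttr(
        name="word_emb_table",
        initializer=fluid.initializer.Normal(0., 512**-0.5)))
```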
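Finally, a sketch of the weighted, token-averaged loss as rewritten above with layers.elementwise_mul; the [None, 1] shapes mirror the flattened [batch_size * max_trg_len_in_batch, 1] label layout in desc.py, and the explicit call is assumed here to keep the broadcast unambiguous now that compile-time dims are None:

```python
import paddle.fluid as fluid
import paddle.fluid.layers as layers

# per-token cross-entropy cost and 0/1 padding weights, both shaped [N, 1]
cost = fluid.data(name="cost", shape=[None, 1], dtype="float32")
weights = fluid.data(name="lbl_weight", shape=[None, 1], dtype="float32")

# zero out padding positions, then average the loss over real tokens
weighted_cost = layers.elementwise_mul(x=cost, y=weights, axis=0)
sum_cost = layers.reduce_sum(weighted_cost)
token_num = layers.reduce_sum(weights)
token_num.stop_gradient = True  # the token count is not trained through
avg_cost = sum_cost / token_num
```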