diff --git a/dygraph/seq2seq/README.md b/dygraph/seq2seq/README.md
new file mode 100755
index 0000000000000000000000000000000000000000..94072c7fe5142038b348f26ba9564e6d5c8d445c
--- /dev/null
+++ b/dygraph/seq2seq/README.md
@@ -0,0 +1,127 @@
+Running the example model in this directory requires PaddlePaddle Fluid 1.7. If your installed version of PaddlePaddle is older than this, please update it following the [installation guide](https://www.paddlepaddle.org.cn/#quick-start).
+
+# Sequence to Sequence (Seq2Seq)
+
+The layout of this example is as follows:
+
+```
+.
+├── README.md              # this document
+├── args.py                # training, inference, and model hyperparameter configuration
+├── reader.py              # data reading utilities
+├── download.py            # data download script
+├── train.py               # training entry point
+├── infer.py               # inference entry point
+├── run.sh                 # launch script with the default training configuration
+├── infer.sh               # launch script with the default decoding configuration
+├── attention_model.py     # translation model with attention
+└── base_model.py          # translation model without attention
+```
+
+## Introduction
+
+Sequence to Sequence (Seq2Seq) uses an encoder-decoder architecture: an encoder encodes the source sequence into a vector, and a decoder decodes that vector into the target sequence. Seq2Seq is widely used in machine translation, dialogue systems, document summarization, image captioning, and similar tasks.
+
+This directory contains a classic Seq2Seq application, machine translation, with two implementations: a base model without attention and a model with attention. A Seq2Seq translation model mimics how a human translates: first parse the source sentence and understand its meaning, then write a sentence in the target language that expresses that meaning. For the theory and mathematical formulation behind machine translation, we recommend [Deep Learning 101](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/basics/machine_translation/index.html).
+
+**This directory demonstrates how to implement a standard Seq2Seq model with the dygraph (imperative) API of Paddle Fluid 1.7.** A static-graph implementation of the same network is available at [Seq2Seq](https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/PaddleTextGEN/seq2seq).
+
+## Model overview
+
+The encoder is a multi-layer LSTM-based RNN. The decoder is an RNN with attention; a decoder without attention is also provided for comparison. At inference time, beam search is used to generate the target sentence.
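+
+To make the decoding step concrete, the sketch below runs beam search over a toy scoring function in pure Python/NumPy. It is illustrative only: the toy function and token ids here are assumptions made for this sketch, not part of this repository (the real implementation lives in `attention_model.py` and `base_model.py`).
+
+```python
+import numpy as np
+
+def toy_log_probs(token, vocab_size=5):
+    # Deterministic fake next-token log-distribution conditioned on the last token.
+    rng = np.random.RandomState(token)
+    logits = rng.randn(vocab_size)
+    e = np.exp(logits - logits.max())
+    return np.log(e / e.sum())
+
+def beam_search(start_token=1, end_token=2, beam_size=3, max_steps=8, vocab_size=5):
+    # Each hypothesis is (accumulated log prob, token list, finished flag).
+    beams = [(0.0, [start_token], False)]
+    for _ in range(max_steps):
+        candidates = []
+        for score, toks, done in beams:
+            if done:  # finished beams are carried over unchanged
+                candidates.append((score, toks, True))
+                continue
+            log_probs = toy_log_probs(toks[-1], vocab_size)
+            for tok in range(vocab_size):
+                candidates.append((score + log_probs[tok], toks + [tok], tok == end_token))
+        # Keep only the top `beam_size` hypotheses by accumulated log probability.
+        beams = sorted(candidates, key=lambda c: c[0], reverse=True)[:beam_size]
+        if all(done for _, _, done in beams):
+            break
+    return beams
+
+for score, toks, _ in beam_search():
+    print(round(score, 3), toks)
+```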
+
+## Data
+
+This example trains on the English-to-Vietnamese portion of the [IWSLT'15 English-Vietnamese data](https://nlp.stanford.edu/projects/nmt/), with tst2012 as the development set and tst2013 as the test set.
+
+### Downloading the data
+
+```
+python download.py
+```
+
+## Training
+
+`run.sh` wraps the training entry point. To start training with the default configuration, simply run:
+
+```
+sh run.sh
+```
+
+By default this trains the RNN model with attention; set `--attention False` to train the model without attention.
+
+```
+python train.py \
+    --src_lang en --tar_lang vi \
+    --attention True \
+    --num_layers 2 \
+    --hidden_size 512 \
+    --src_vocab_size 17191 \
+    --tar_vocab_size 7709 \
+    --batch_size 128 \
+    --dropout 0.2 \
+    --init_scale  0.1 \
+    --max_grad_norm 5.0 \
+    --train_data_prefix data/en-vi/train \
+    --eval_data_prefix data/en-vi/tst2012 \
+    --test_data_prefix data/en-vi/tst2013 \
+    --vocab_prefix data/en-vi/vocab \
+    --use_gpu True \
+    --model_path ./attention_models
+```
+
+The training program saves a checkpoint at the end of every epoch.
+
+## Inference
+
+Once training has finished, run inference with the `infer.sh` script. By default it decodes the test set with beam search, loading the model saved after the 10th epoch:
+
+```
+sh infer.sh
+```
+
+To decode a different file, change the `--infer_file` argument:
+
+```
+python infer.py \
+    --attention True \
+    --src_lang en --tar_lang vi \
+    --num_layers 2 \
+    --hidden_size 512 \
+    --src_vocab_size 17191 \
+    --tar_vocab_size 7709 \
+    --batch_size 128 \
+    --dropout 0.2 \
+    --init_scale  0.1 \
+    --max_grad_norm 5.0 \
+    --vocab_prefix data/en-vi/vocab \
+    --infer_file data/en-vi/tst2013.en \
+    --reload_model attention_models/epoch_10 \
+    --infer_output_file attention_infer_output/infer_output.txt \
+    --beam_size 10 \
+    --use_gpu True
+```
+
+## Evaluation
+
+Use the [*multi-bleu.perl*](https://github.com/moses-smt/mosesdecoder.git) tool to evaluate translation quality:
+
+```sh
+mosesdecoder/scripts/generic/multi-bleu.perl tst2013.vi < infer_output.txt
+```
+
+Each model was trained 10 times; each run decodes with the checkpoint saved after the 10th epoch and beam_size=10. The results are shown below (each list of 10 runs is sorted in ascending order for readability):
+
+```
+> no attention
+tst2012 BLEU:
+[10.75 10.85 10.9 10.94 10.97 11.01 11.01 11.04 11.13 11.4]
+tst2013 BLEU:
+[10.71 10.71 10.74 10.76 10.91 10.94 11.02 11.16 11.21 11.44]
+
+> with attention
+tst2012 BLEU:
+[21.14 22.34 22.54 22.65 22.71 22.71 23.08 23.15 23.3 23.4]
+tst2013 BLEU:
+[23.41 24.79 25.11 25.12 25.19 25.24 25.39 25.61 25.61 25.63]
+```
diff --git a/dygraph/seq2seq/__init__.py b/dygraph/seq2seq/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/dygraph/seq2seq/args.py b/dygraph/seq2seq/args.py
new file mode 100755
index 0000000000000000000000000000000000000000..99f21b0800d9a2696e245fc807b393308a98e09a
--- /dev/null
+++ b/dygraph/seq2seq/args.py
@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import distutils.util
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--train_data_prefix", type=str, help="file prefix for train data")
+    parser.add_argument(
+        "--eval_data_prefix", type=str, help="file prefix for eval data")
+    parser.add_argument(
+        "--test_data_prefix", type=str, help="file prefix for test data")
+    parser.add_argument(
+        "--vocab_prefix", type=str, help="file prefix for vocab")
+    parser.add_argument("--src_lang", type=str, help="source language suffix")
+    parser.add_argument("--tar_lang", type=str, help="target language suffix")
+
+    parser.add_argument(
+        "--attention",
+        type=eval,
+        default=False,
+        help="whether to use the attention model [True|False]")
+
+    parser.add_argument(
+        "--optimizer",
+        type=str,
+        default='adam',
+        help="optimizer to use, only support [sgd|adam]")
+
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=0.001,
+        help="learning rate for optimizer")
+
+    parser.add_argument(
+        "--num_layers",
+        type=int,
+        default=1,
+        help="number of layers in the encoder and decoder")
+    parser.add_argument(
+        "--hidden_size",
+        type=int,
+        default=100,
+        help="hidden size of the encoder and decoder")
+    parser.add_argument("--src_vocab_size", type=int, help="source vocab size")
+    parser.add_argument("--tar_vocab_size", type=int, help="target vocab size")
+
+    parser.add_argument(
+        "--batch_size", type=int, help="batch size of each step")
+
+    parser.add_argument(
+        "--max_epoch", type=int, default=12, help="max epoch for the training")
+
+    parser.add_argument(
+        "--max_len",
+        type=int,
+        default=50,
+        help="max length for source and target sentences")
+    parser.add_argument(
+        "--dropout", type=float, default=0.0, help="drop probability")
+    parser.add_argument(
+        "--init_scale",
+        type=float,
+        default=0.0,
+        help="init scale for parameters")
+    parser.add_argument(
+        "--max_grad_norm",
+        type=float,
+        default=5.0,
+        help="max grad norm for global norm clip")
+
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default='model',
+        help="path under which the model is saved")
+
+    parser.add_argument(
+        "--reload_model", type=str, help="model to reload for inference")
+
+    parser.add_argument(
+        "--infer_file", type=str, help="file name for inference")
+ "--infer_output_file", + type=str, + default='infer_output', + help="file name for inference output") + parser.add_argument( + "--beam_size", type=int, default=10, help="file name for inference") + + parser.add_argument( + '--use_gpu', + type=eval, + default=False, + help='Whether using gpu [True|False]') + + parser.add_argument( + "--enable_ce", + action='store_true', + help="The flag indicating whether to run the task " + "for continuous evaluation.") + + parser.add_argument( + "--profile", action='store_true', help="Whether enable the profile.") + # NOTE: profiler args, used for benchmark + parser.add_argument( + "--profiler_path", + type=str, + default='./seq2seq.profile', + help="the profiler output file path. (used for benchmark)") + args = parser.parse_args() + return args diff --git a/dygraph/seq2seq/attention_model.py b/dygraph/seq2seq/attention_model.py new file mode 100755 index 0000000000000000000000000000000000000000..09ecfed58e70c485f3f1e962e88cd3dbda8e19f6 --- /dev/null +++ b/dygraph/seq2seq/attention_model.py @@ -0,0 +1,362 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle.fluid as fluid +import numpy as np +from paddle.fluid import ParamAttr +from paddle.fluid.dygraph.base import to_variable +from paddle.fluid.dygraph.nn import Embedding, FC +from rnn import BasicLSTMUnit +import numpy as np + +INF = 1. 
+    def _gather(self, x, indices, batch_pos):
+        topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2)
+        return fluid.layers.gather_nd(x, topk_coordinates)
+
+    def attention(self, query, enc_output, mask=None):
+        query = fluid.layers.unsqueeze(query, [1])
+        memory = self.attn_fc(enc_output)
+        attn = fluid.layers.matmul(query, memory, transpose_y=True)
+
+        if mask is not None:
+            attn = fluid.layers.transpose(attn, [1, 0, 2])
+            attn = fluid.layers.elementwise_add(attn, mask * 1000000000, -1)
+            attn = fluid.layers.transpose(attn, [1, 0, 2])
+        weight = fluid.layers.softmax(attn)
+        weight_memory = fluid.layers.matmul(weight, memory)
+
+        return weight_memory
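+    # NOTE: attention() implements dot-product attention in the style of
+    # Luong et al.: the decoder state (query) is matched against the projected
+    # encoder outputs (memory) with a matmul; padding positions are pushed to
+    # -1e9 before the softmax (enc_padding_mask is 0.0 at real tokens and -1.0
+    # at padding, hence the `mask * 1000000000` term); the resulting weights
+    # form a weighted sum over the memory.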
+    def forward(self, inputs):
+        inputs = [fluid.dygraph.to_variable(np_inp) for np_inp in inputs]
+        src, tar, label, src_sequence_length, tar_sequence_length = inputs
+        if src.shape[0] < self.batch_size:
+            self.batch_size = src.shape[0]
+        src_emb = self.src_embeder(self._transpose_batch_time(src))
+
+        enc_hidden = to_variable(np.zeros((self.num_layers, self.batch_size, self.hidden_size), dtype='float32'))
+        enc_cell = to_variable(np.zeros((self.num_layers, self.batch_size, self.hidden_size), dtype='float32'))
+
+        max_seq_len = src_emb.shape[0]
+        enc_len_mask = fluid.layers.sequence_mask(src_sequence_length, maxlen=max_seq_len, dtype="float32")
+        enc_padding_mask = (enc_len_mask - 1.0)  # 0.0 at real tokens, -1.0 at padding
+        enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0])
+        enc_states = [[enc_hidden, enc_cell]]
+        enc_outputs = []
+        for l in range(max_seq_len):
+            step_input = src_emb[l]
+            step_mask = enc_len_mask[l]
+            enc_hidden, enc_cell = enc_states[l]
+            new_enc_hidden, new_enc_cell = [], []
+            for i in range(self.num_layers):
+                new_hidden, new_cell = self.enc_units[i](step_input, enc_hidden[i], enc_cell[i])
+                new_enc_hidden.append(new_hidden)
+                new_enc_cell.append(new_cell)
+                if self.dropout is not None and self.dropout > 0.0:
+                    step_input = fluid.layers.dropout(
+                        new_hidden,
+                        dropout_prob=self.dropout,
+                        dropout_implementation='upscale_in_train')
+                else:
+                    step_input = new_hidden
+            new_enc_hidden = [self._real_state(enc_hidden[i], new_enc_hidden[i], step_mask) for i in range(self.num_layers)]
+            new_enc_cell = [self._real_state(enc_cell[i], new_enc_cell[i], step_mask) for i in range(self.num_layers)]
+            enc_states.append([new_enc_hidden, new_enc_cell])
+            enc_outputs.append(step_input)
+        enc_outputs = fluid.layers.stack(enc_outputs)
+        enc_outputs = self._transpose_batch_time(enc_outputs)
+
+        if self.mode in ['train', 'eval']:
+            # the input_feed mechanism follows Luong et al.: https://arxiv.org/pdf/1508.04025.pdf
+            input_feed = to_variable(np.zeros((self.batch_size, self.hidden_size), dtype='float32'))
+
+            dec_hidden, dec_cell = enc_states[-1]
+            tar_emb = self.tar_embeder(self._transpose_batch_time(tar))
+            max_seq_len = tar_emb.shape[0]
+            dec_output = []
+
+            for step_idx in range(max_seq_len):
+                step_input = tar_emb[step_idx]
+                step_input = fluid.layers.concat([step_input, input_feed], 1)
+                new_dec_hidden, new_dec_cell = [], []
+                for i in range(self.num_layers):
+                    new_hidden, new_cell = self.dec_units[i](step_input, dec_hidden[i], dec_cell[i])
+                    new_dec_hidden.append(new_hidden)
+                    new_dec_cell.append(new_cell)
+                    if self.dropout is not None and self.dropout > 0.0:
+                        step_input = fluid.layers.dropout(
+                            new_hidden,
+                            dropout_prob=self.dropout,
+                            dropout_implementation='upscale_in_train')
+                    else:
+                        step_input = new_hidden
+
+                dec_att = self.attention(step_input, enc_outputs, enc_padding_mask)
+                dec_att = fluid.layers.squeeze(dec_att, [1])
+                concat_att_out = fluid.layers.concat([dec_att, step_input], 1)
+                out = self.concat_fc(concat_att_out)
+                input_feed = out
+                dec_output.append(out)
+                dec_hidden, dec_cell = new_dec_hidden, new_dec_cell
+
+            dec_output = fluid.layers.stack(dec_output)
+            dec_output = self.fc(self._transpose_batch_time(dec_output))
+
+            loss = fluid.layers.softmax_with_cross_entropy(
+                logits=dec_output, label=label, soft_label=False)
+            loss = fluid.layers.squeeze(loss, axes=[2])
+            max_tar_seq_len = fluid.layers.shape(tar)[1]
+            tar_mask = fluid.layers.sequence_mask(
+                tar_sequence_length, maxlen=max_tar_seq_len, dtype='float32')
+            loss = loss * tar_mask
+            loss = fluid.layers.reduce_mean(loss, dim=[0])
+            loss = fluid.layers.reduce_sum(loss)
+            return loss
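+        # The beam-search branch below keeps `beam_size` hypotheses per
+        # example. Tensors are stored as [batch_size, beam_size, ...] and
+        # flattened to [batch_size * beam_size, ...] (_merge_batch_beams)
+        # whenever they pass through the LSTM cells, then split back
+        # (_split_batch_beams) for the per-beam top-k bookkeeping. The
+        # per-beam log probs start as [0, -inf, ...] so that the first top-k
+        # draws all candidates from beam 0 rather than beam_size identical
+        # copies of the same token.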
+        elif self.mode in ['beam_search']:
+            enc_outputs = self.tile_beam_merge_with_batch(enc_outputs)
+            enc_padding_mask = self.tile_beam_merge_with_batch(enc_padding_mask)
+            batch_beam_shape = (self.batch_size, self.beam_size)
+            batch_beam_shape_1 = (self.batch_size, self.beam_size, 1)
+            vocab_size_tensor = to_variable(np.full((1), self.tar_vocab_size))
+            start_token_tensor = to_variable(np.full(batch_beam_shape_1, self.beam_start_token, dtype='int64'))  # remove last dim 1 in v1.7
+            end_token_tensor = to_variable(np.full(batch_beam_shape, self.beam_end_token, dtype='int64'))
+            step_input = self.tar_embeder(start_token_tensor)
+            input_feed = to_variable(np.zeros((self.batch_size, self.hidden_size), dtype='float32'))
+            input_feed = self._expand_to_beam_size(input_feed)
+            input_feed = self._merge_batch_beams(input_feed)
+            beam_finished = to_variable(np.full(batch_beam_shape, 0, dtype='float32'))
+            beam_state_log_probs = to_variable(np.array([[0.] + [-self.kinf] * (self.beam_size - 1)], dtype="float32"))
+            beam_state_log_probs = fluid.layers.expand(beam_state_log_probs, [self.batch_size, 1])
+
+            dec_hidden, dec_cell = enc_states[-1]
+            dec_hidden = [self._expand_to_beam_size(state) for state in dec_hidden]
+            dec_cell = [self._expand_to_beam_size(state) for state in dec_cell]
+
+            batch_pos = fluid.layers.expand(
+                fluid.layers.unsqueeze(to_variable(np.arange(0, self.batch_size, 1, dtype="int64")), [1]),
+                [1, self.beam_size])
+            predicted_ids = []
+            parent_ids = []
+
+            for step_idx in range(self.beam_max_step_num):
+                if fluid.layers.reduce_sum(1 - beam_finished).numpy()[0] == 0:
+                    break
+                step_input = self._merge_batch_beams(step_input)
+                step_input = fluid.layers.concat([step_input, input_feed], 1)
+                new_dec_hidden, new_dec_cell = [], []
+                dec_hidden = [self._merge_batch_beams(state) for state in dec_hidden]
+                dec_cell = [self._merge_batch_beams(state) for state in dec_cell]
+
+                for i in range(self.num_layers):
+                    new_hidden, new_cell = self.dec_units[i](step_input, dec_hidden[i], dec_cell[i])
+                    new_dec_hidden.append(new_hidden)
+                    new_dec_cell.append(new_cell)
+                    if self.dropout is not None and self.dropout > 0.0:
+                        step_input = fluid.layers.dropout(
+                            new_hidden,
+                            dropout_prob=self.dropout,
+                            dropout_implementation='upscale_in_train')
+                    else:
+                        step_input = new_hidden
+                dec_att = self.attention(step_input, enc_outputs, enc_padding_mask)
+                dec_att = fluid.layers.squeeze(dec_att, [1])
+                concat_att_out = fluid.layers.concat([dec_att, step_input], 1)
+                out = self.concat_fc(concat_att_out)
+                input_feed = out
+                cell_outputs = self._split_batch_beams(out)
+                cell_outputs = self.fc(cell_outputs)
+                step_log_probs = fluid.layers.log(fluid.layers.softmax(cell_outputs))
+                noend_array = [-self.kinf] * self.tar_vocab_size
+                noend_array[self.beam_end_token] = 0  # [-kinf, -kinf, ..., 0, -kinf, ...]
+                noend_mask_tensor = to_variable(np.array(noend_array, dtype='float32'))
+                # a finished beam may only extend with </s>, at log prob 0
+                step_log_probs = fluid.layers.elementwise_mul(
+                    fluid.layers.expand(fluid.layers.unsqueeze(beam_finished, [2]), [1, 1, self.tar_vocab_size]),
+                    noend_mask_tensor, axis=-1) - \
+                    fluid.layers.elementwise_mul(step_log_probs, (beam_finished - 1), axis=0)
+                log_probs = fluid.layers.elementwise_add(
+                    x=step_log_probs, y=beam_state_log_probs, axis=0)
+                scores = fluid.layers.reshape(log_probs, [-1, self.beam_size * self.tar_vocab_size])
+                topk_scores, topk_indices = fluid.layers.topk(input=scores, k=self.beam_size)
+                beam_indices = fluid.layers.elementwise_floordiv(topk_indices, vocab_size_tensor)  # which beam each candidate came from
+                token_indices = fluid.layers.elementwise_mod(topk_indices, vocab_size_tensor)  # token id within the vocab
+                next_log_probs = self._gather(scores, topk_indices, batch_pos)
+
+                new_dec_hidden = [self._split_batch_beams(state) for state in new_dec_hidden]
+                new_dec_cell = [self._split_batch_beams(state) for state in new_dec_cell]
+                new_dec_hidden = [self._gather(x, beam_indices, batch_pos) for x in new_dec_hidden]
+                new_dec_cell = [self._gather(x, beam_indices, batch_pos) for x in new_dec_cell]
+
+                next_finished = self._gather(beam_finished, beam_indices, batch_pos)
+                next_finished = fluid.layers.cast(next_finished, "bool")
+                next_finished = fluid.layers.logical_or(next_finished, fluid.layers.equal(token_indices, end_token_tensor))
+                next_finished = fluid.layers.cast(next_finished, "float32")
+                # prepare for the next step
+                dec_hidden, dec_cell = new_dec_hidden, new_dec_cell
+                beam_finished = next_finished
+                beam_state_log_probs = next_log_probs
+                step_input = self.tar_embeder(fluid.layers.unsqueeze(token_indices, 2))  # remove unsqueeze in v1.7
+                predicted_ids.append(token_indices)
+                parent_ids.append(beam_indices)
+
+            predicted_ids = fluid.layers.stack(predicted_ids)
+            parent_ids = fluid.layers.stack(parent_ids)
+            predicted_ids = fluid.layers.gather_tree(predicted_ids, parent_ids)
+            predicted_ids = self._transpose_batch_time(predicted_ids)
+            return predicted_ids
+        else:
+            raise Exception("unsupported mode: " + self.mode)
\ No newline at end of file
diff --git a/dygraph/seq2seq/base_model.py b/dygraph/seq2seq/base_model.py
new file mode 100755
index 0000000000000000000000000000000000000000..120d2084ee9002795911925a3f098f9009c55aa9
--- /dev/null
+++ b/dygraph/seq2seq/base_model.py
@@ -0,0 +1,285 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid import ParamAttr
+from paddle.fluid.dygraph import to_variable
+from paddle.fluid.dygraph.nn import Embedding, Linear
+from rnn import BasicLSTMUnit
+
+INF = 1. * 1e5
+alpha = 0.6
+uniform_initializer = lambda x: fluid.initializer.UniformInitializer(low=-x, high=x)
+zero_constant = fluid.initializer.Constant(0.0)
+
+class BaseModel(fluid.dygraph.Layer):
+    def __init__(self,
+                 hidden_size,
+                 src_vocab_size,
+                 tar_vocab_size,
+                 batch_size,
+                 num_layers=1,
+                 init_scale=0.1,
+                 dropout=None,
+                 beam_size=1,
+                 beam_start_token=1,
+                 beam_end_token=2,
+                 beam_max_step_num=100,
+                 mode='train'):
+        super(BaseModel, self).__init__()
+        self.hidden_size = hidden_size
+        self.src_vocab_size = src_vocab_size
+        self.tar_vocab_size = tar_vocab_size
+        self.batch_size = batch_size
+        self.num_layers = num_layers
+        self.init_scale = init_scale
+        self.dropout = dropout
+        self.beam_size = beam_size
+        self.beam_start_token = beam_start_token
+        self.beam_end_token = beam_end_token
+        self.beam_max_step_num = beam_max_step_num
+        self.mode = mode
+        self.kinf = 1e9
+
+        param_attr = ParamAttr(initializer=uniform_initializer(self.init_scale))
+        bias_attr = ParamAttr(initializer=zero_constant)
+        forget_bias = 1.0
+
+        self.src_embeder = Embedding(
+            size=[self.src_vocab_size, self.hidden_size],
+            param_attr=fluid.ParamAttr(
+                initializer=uniform_initializer(init_scale)))
+
+        self.tar_embeder = Embedding(
+            size=[self.tar_vocab_size, self.hidden_size],
+            is_sparse=False,
+            param_attr=fluid.ParamAttr(
+                initializer=uniform_initializer(init_scale)))
+
+        self.enc_units = []
+        for i in range(num_layers):
+            self.enc_units.append(
+                self.add_sublayer("enc_units_%d" % i,
+                                  BasicLSTMUnit(
+                                      hidden_size=self.hidden_size,
+                                      input_size=self.hidden_size,
+                                      param_attr=param_attr,
+                                      bias_attr=bias_attr,
+                                      forget_bias=forget_bias)))
+
+        self.dec_units = []
+        for i in range(num_layers):
+            self.dec_units.append(
+                self.add_sublayer("dec_units_%d" % i,
+                                  BasicLSTMUnit(
+                                      hidden_size=self.hidden_size,
+                                      input_size=self.hidden_size,
+                                      param_attr=param_attr,
+                                      bias_attr=bias_attr,
+                                      forget_bias=forget_bias)))
+
+        self.fc = fluid.dygraph.nn.Linear(self.hidden_size,
+                                          self.tar_vocab_size,
+                                          param_attr=param_attr,
+                                          bias_attr=False)
+
+    def _transpose_batch_time(self, x):
+        return fluid.layers.transpose(x, [1, 0] + list(range(2, len(x.shape))))
+
+    def _merge_batch_beams(self, x):
+        return fluid.layers.reshape(x, shape=(-1, x.shape[2]))
+
+    def _split_batch_beams(self, x):
+        return fluid.layers.reshape(x, shape=(-1, self.beam_size, x.shape[1]))
+
+    def _expand_to_beam_size(self, x):
+        x = fluid.layers.unsqueeze(x, [1])
+        expand_times = [1] * len(x.shape)
+        expand_times[1] = self.beam_size
+        x = fluid.layers.expand(x, expand_times)
+        return x
+
+    def _real_state(self, state, new_state, step_mask):
+        # keep the new state at real steps and the old state at padded steps
+        new_state = fluid.layers.elementwise_mul(new_state, step_mask, axis=0) - \
+                    fluid.layers.elementwise_mul(state, (step_mask - 1), axis=0)
+        return new_state
+
+    def _gather(self, x, indices, batch_pos):
+        topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2)
+        return fluid.layers.gather_nd(x, topk_coordinates)
+
+    def forward(self, inputs):
+        inputs = [fluid.dygraph.to_variable(np_inp) for np_inp in inputs]
+        src, tar, label, src_sequence_length, tar_sequence_length = inputs
+        if src.shape[0] < self.batch_size:
+            self.batch_size = src.shape[0]
+        src_emb = self.src_embeder(self._transpose_batch_time(src))
+
+        enc_hidden = to_variable(np.zeros((self.num_layers, self.batch_size, self.hidden_size), dtype='float32'))
+        enc_cell = to_variable(np.zeros((self.num_layers, self.batch_size, self.hidden_size), dtype='float32'))
+
+        max_seq_len = src_emb.shape[0]
+        enc_len_mask = fluid.layers.sequence_mask(src_sequence_length, maxlen=max_seq_len, dtype="float32")
+        enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0])
+        enc_states = [[enc_hidden, enc_cell]]
+        for l in range(max_seq_len):
+            step_input = src_emb[l]
+            step_mask = enc_len_mask[l]
+            enc_hidden, enc_cell = enc_states[l]
+            new_enc_hidden, new_enc_cell = [], []
+            for i in range(self.num_layers):
+                new_hidden, new_cell = self.enc_units[i](step_input, enc_hidden[i], enc_cell[i])
+                new_enc_hidden.append(new_hidden)
+                new_enc_cell.append(new_cell)
+                if self.dropout is not None and self.dropout > 0.0:
+                    step_input = fluid.layers.dropout(
+                        new_hidden,
+                        dropout_prob=self.dropout,
+                        dropout_implementation='upscale_in_train')
+                else:
+                    step_input = new_hidden
+            new_enc_hidden = [self._real_state(enc_hidden[i], new_enc_hidden[i], step_mask) for i in range(self.num_layers)]
+            new_enc_cell = [self._real_state(enc_cell[i], new_enc_cell[i], step_mask) for i in range(self.num_layers)]
+            enc_states.append([new_enc_hidden, new_enc_cell])
+
+        if self.mode in ['train', 'eval']:
+            dec_hidden, dec_cell = enc_states[-1]
+            tar_emb = self.tar_embeder(self._transpose_batch_time(tar))
+            max_seq_len = tar_emb.shape[0]
+            dec_output = []
+
+            for step_idx in range(max_seq_len):
+                step_input = tar_emb[step_idx]
+                new_dec_hidden, new_dec_cell = [], []
+                for i in range(self.num_layers):
+                    new_hidden, new_cell = self.dec_units[i](step_input, dec_hidden[i], dec_cell[i])
+                    new_dec_hidden.append(new_hidden)
+                    new_dec_cell.append(new_cell)
+                    if self.dropout is not None and self.dropout > 0.0:
+                        step_input = fluid.layers.dropout(
+                            new_hidden,
+                            dropout_prob=self.dropout,
+                            dropout_implementation='upscale_in_train')
+                    else:
+                        step_input = new_hidden
+                dec_output.append(step_input)
+                dec_hidden, dec_cell = new_dec_hidden, new_dec_cell
+
+            dec_output = fluid.layers.stack(dec_output)
+            dec_output = self.fc(self._transpose_batch_time(dec_output))
+
+            loss = fluid.layers.softmax_with_cross_entropy(
+                logits=dec_output, label=label, soft_label=False)
+            loss = fluid.layers.squeeze(loss, axes=[2])
+            max_tar_seq_len = fluid.layers.shape(tar)[1]
+            tar_mask = fluid.layers.sequence_mask(
+                tar_sequence_length, maxlen=max_tar_seq_len, dtype='float32')
+            loss = loss * tar_mask
+            loss = fluid.layers.reduce_mean(loss, dim=[0])
+            loss = fluid.layers.reduce_sum(loss)
+            return loss
+        elif self.mode in ['beam_search']:
+            batch_beam_shape = (self.batch_size, self.beam_size)
+            vocab_size_tensor = to_variable(np.full((1), self.tar_vocab_size))
+            start_token_tensor = to_variable(np.full(batch_beam_shape, self.beam_start_token, dtype='int64'))  # remove last dim 1 in v1.7
+            end_token_tensor = to_variable(np.full(batch_beam_shape, self.beam_end_token, dtype='int64'))
+            step_input = self.tar_embeder(start_token_tensor)
+            beam_finished = to_variable(np.full(batch_beam_shape, 0, dtype='float32'))
+            beam_state_log_probs = to_variable(np.array([[0.] + [-self.kinf] * (self.beam_size - 1)], dtype="float32"))
+            beam_state_log_probs = fluid.layers.expand(beam_state_log_probs, [self.batch_size, 1])
+
+            dec_hidden, dec_cell = enc_states[-1]
+            dec_hidden = [self._expand_to_beam_size(state) for state in dec_hidden]
+            dec_cell = [self._expand_to_beam_size(state) for state in dec_cell]
+
+            batch_pos = fluid.layers.expand(
+                fluid.layers.unsqueeze(to_variable(np.arange(0, self.batch_size, 1, dtype="int64")), [1]),
+                [1, self.beam_size])
+            predicted_ids = []
+            parent_ids = []
+
+            for step_idx in range(self.beam_max_step_num):
+                if fluid.layers.reduce_sum(1 - beam_finished).numpy()[0] == 0:
+                    break
+                step_input = self._merge_batch_beams(step_input)
+                new_dec_hidden, new_dec_cell = [], []
+                dec_hidden = [self._merge_batch_beams(state) for state in dec_hidden]
+                dec_cell = [self._merge_batch_beams(state) for state in dec_cell]
+
+                for i in range(self.num_layers):
+                    new_hidden, new_cell = self.dec_units[i](step_input, dec_hidden[i], dec_cell[i])
+                    new_dec_hidden.append(new_hidden)
+                    new_dec_cell.append(new_cell)
+                    if self.dropout is not None and self.dropout > 0.0:
+                        step_input = fluid.layers.dropout(
+                            new_hidden,
+                            dropout_prob=self.dropout,
+                            dropout_implementation='upscale_in_train')
+                    else:
+                        step_input = new_hidden
+                cell_outputs = self._split_batch_beams(step_input)
+                cell_outputs = self.fc(cell_outputs)
+                # beam search step
+                step_log_probs = fluid.layers.log(fluid.layers.softmax(cell_outputs))
+                noend_array = [-self.kinf] * self.tar_vocab_size
+                noend_array[self.beam_end_token] = 0  # [-kinf, -kinf, ..., 0, -kinf, ...]
+                noend_mask_tensor = to_variable(np.array(noend_array, dtype='float32'))
+                # a finished beam may only extend with </s>, at log prob 0
+                step_log_probs = fluid.layers.elementwise_mul(
+                    fluid.layers.expand(fluid.layers.unsqueeze(beam_finished, [2]), [1, 1, self.tar_vocab_size]),
+                    noend_mask_tensor, axis=-1) - \
+                    fluid.layers.elementwise_mul(step_log_probs, (beam_finished - 1), axis=0)
+                log_probs = fluid.layers.elementwise_add(
+                    x=step_log_probs, y=beam_state_log_probs, axis=0)
+                scores = fluid.layers.reshape(log_probs, [-1, self.beam_size * self.tar_vocab_size])
+                topk_scores, topk_indices = fluid.layers.topk(input=scores, k=self.beam_size)
+                beam_indices = fluid.layers.elementwise_floordiv(topk_indices, vocab_size_tensor)  # which beam each candidate came from
+                token_indices = fluid.layers.elementwise_mod(topk_indices, vocab_size_tensor)  # token id within the vocab
+                next_log_probs = self._gather(scores, topk_indices, batch_pos)
+
+                new_dec_hidden = [self._split_batch_beams(state) for state in new_dec_hidden]
+                new_dec_cell = [self._split_batch_beams(state) for state in new_dec_cell]
+                new_dec_hidden = [self._gather(x, beam_indices, batch_pos) for x in new_dec_hidden]
+                new_dec_cell = [self._gather(x, beam_indices, batch_pos) for x in new_dec_cell]
+
+                next_finished = self._gather(beam_finished, beam_indices, batch_pos)
+                next_finished = fluid.layers.cast(next_finished, "bool")
+                next_finished = fluid.layers.logical_or(next_finished, fluid.layers.equal(token_indices, end_token_tensor))
+                next_finished = fluid.layers.cast(next_finished, "float32")
+                # prepare for the next step
+                dec_hidden, dec_cell = new_dec_hidden, new_dec_cell
+                beam_finished = next_finished
+                beam_state_log_probs = next_log_probs
+                step_input = self.tar_embeder(fluid.layers.unsqueeze(token_indices, 2))  # remove unsqueeze in v1.7
+                predicted_ids.append(token_indices)
+                parent_ids.append(beam_indices)
+
+            predicted_ids = fluid.layers.stack(predicted_ids)
+            parent_ids = fluid.layers.stack(parent_ids)
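+            # gather_tree walks the recorded (token, parent-beam) pairs
+            # backwards from the final step, reconstructing the full token
+            # sequence of every surviving hypothesis.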
+            predicted_ids = fluid.layers.gather_tree(predicted_ids, parent_ids)
+            predicted_ids = self._transpose_batch_time(predicted_ids)
+            return predicted_ids
+        else:
+            raise Exception("unsupported mode: " + self.mode)
\ No newline at end of file
diff --git a/dygraph/seq2seq/download.py b/dygraph/seq2seq/download.py
new file mode 100755
index 0000000000000000000000000000000000000000..4dd1466d25bf16d7b4fc9bc3819fff8fe12f7adf
--- /dev/null
+++ b/dygraph/seq2seq/download.py
@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+'''
+Script for downloading the training data.
+'''
+import os
+import urllib
+import sys
+
+if sys.version_info >= (3, 0):
+    import urllib.request
+import zipfile
+
+URLLIB = urllib
+if sys.version_info >= (3, 0):
+    URLLIB = urllib.request
+
+remote_path = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi'
+base_path = 'data'
+tar_path = os.path.join(base_path, 'en-vi')
+filenames = [
+    'train.en', 'train.vi', 'tst2012.en', 'tst2012.vi', 'tst2013.en',
+    'tst2013.vi', 'vocab.en', 'vocab.vi'
+]
+
+
+def main(arguments):
+    print("Downloading data......")
+
+    if not os.path.exists(tar_path):
+        if not os.path.exists(base_path):
+            os.mkdir(base_path)
+        os.mkdir(tar_path)
+
+    for filename in filenames:
+        url = remote_path + '/' + filename
+        tar_file = os.path.join(tar_path, filename)
+        URLLIB.urlretrieve(url, tar_file)
+    print("Download succeeded......")
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv[1:]))
diff --git a/dygraph/seq2seq/infer.py b/dygraph/seq2seq/infer.py
new file mode 100755
index 0000000000000000000000000000000000000000..4a531661fd3280f96707ba480cb1806fc051b818
--- /dev/null
+++ b/dygraph/seq2seq/infer.py
@@ -0,0 +1,170 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import time
+import os
+import random
+import logging
+import math
+import io
+import paddle
+import paddle.fluid as fluid
+
+import reader
+
+import sys
+line_tok = '\n'
+space_tok = ' '
+if sys.version[0] == '2':
+    reload(sys)
+    sys.setdefaultencoding("utf-8")
+    line_tok = u'\n'
+    space_tok = u' '
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger("fluid")
+logger.setLevel(logging.INFO)
+
+from args import *
+import pickle
+
+from attention_model import AttentionModel
+from base_model import BaseModel
+
+
+def infer():
+    args = parse_args()
+
+    num_layers = args.num_layers
+    src_vocab_size = args.src_vocab_size
+    tar_vocab_size = args.tar_vocab_size
+    batch_size = args.batch_size
+    dropout = args.dropout
+    init_scale = args.init_scale
+    max_grad_norm = args.max_grad_norm
+    hidden_size = args.hidden_size
+    # inference process
+
+    print("src vocab size", src_vocab_size)
+    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+    with fluid.dygraph.guard(place):
+        # dropout uses upscale_in_train, so it can simply be disabled at
+        # inference time by setting dropout to 0
+        if args.attention:
+            model = AttentionModel(
+                hidden_size,
+                src_vocab_size,
+                tar_vocab_size,
+                batch_size,
+                beam_size=args.beam_size,
+                num_layers=num_layers,
+                init_scale=init_scale,
+                dropout=0.0,
+                mode='beam_search')
+        else:
+            model = BaseModel(
+                hidden_size,
+                src_vocab_size,
+                tar_vocab_size,
+                batch_size,
+                beam_size=args.beam_size,
+                num_layers=num_layers,
+                init_scale=init_scale,
+                dropout=0.0,
+                mode='beam_search')
+
+        source_vocab_file = args.vocab_prefix + "." + args.src_lang
+        infer_file = args.infer_file
+
+        infer_data = reader.raw_mono_data(source_vocab_file, infer_file)
+
+        def prepare_input(batch, epoch_id=0):
+            src_ids, src_mask, tar_ids, tar_mask = batch
+            src_ids = src_ids.reshape((src_ids.shape[0], src_ids.shape[1]))
+            in_tar = tar_ids[:, :-1]
+            label_tar = tar_ids[:, 1:]
+
+            in_tar = in_tar.reshape((in_tar.shape[0], in_tar.shape[1]))
+            label_tar = label_tar.reshape(
+                (label_tar.shape[0], label_tar.shape[1], 1))
+            inputs = [src_ids, in_tar, label_tar, src_mask, tar_mask]
+            return inputs, np.sum(tar_mask)
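+        # Each batch is fed to the model as
+        #   [src_ids, in_tar, label_tar, src_mask, tar_mask]:
+        # in_tar drops the last target token and label_tar drops the first,
+        # so label_tar is in_tar shifted by one position; the masks hold the
+        # effective (unpadded) length of each sequence.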
+
+        dir_name = args.reload_model
+        print("dir name", dir_name)
+        state_dict, _ = fluid.dygraph.load_dygraph(dir_name)
+        model.set_dict(state_dict)
+        model.eval()
+
+        train_data_iter = reader.get_data_iter(infer_data, batch_size, mode='infer')
+
+        tar_id2vocab = []
+        tar_vocab_file = args.vocab_prefix + "." + args.tar_lang
+        with io.open(tar_vocab_file, "r", encoding='utf-8') as f:
+            for line in f.readlines():
+                tar_id2vocab.append(line.strip())
+
+        infer_output_file = args.infer_output_file
+        infer_output_dir = infer_output_file.split('/')[0]
+        if not os.path.exists(infer_output_dir):
+            os.mkdir(infer_output_dir)
+
+        with io.open(infer_output_file, 'w', encoding='utf-8') as out_file:
+
+            for batch_id, batch in enumerate(train_data_iter):
+                input_data_feed, word_num = prepare_input(batch, epoch_id=0)
+                outputs = model(input_data_feed)
+
+                for i in range(outputs.shape[0]):
+                    # take the best (first) beam and cut it at the first </s>
+                    ins = outputs[i].numpy()
+                    res = [tar_id2vocab[e] for e in ins[:, 0].reshape(-1)]
+                    new_res = []
+                    for ele in res:
+                        if ele == "</s>":
+                            break
+                        new_res.append(ele)
+
+                    out_file.write(space_tok.join(new_res))
+                    out_file.write(line_tok)
+
+
+def check_version():
+    """
+    Log an error and exit when the installed version of PaddlePaddle is
+    not satisfied.
+    """
+    err = "PaddlePaddle version 1.6 or higher is required, " \
+          "or a suitable develop version is satisfied as well. \n" \
+          "Please make sure the version is good with your code."
+
+    try:
+        fluid.require_version('1.6.0')
+    except Exception as e:
+        logger.error(err)
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    check_version()
+    infer()
diff --git a/dygraph/seq2seq/infer.sh b/dygraph/seq2seq/infer.sh
new file mode 100755
index 0000000000000000000000000000000000000000..5c79b9ee838dc28bd1eec261eb5301ebadc208d0
--- /dev/null
+++ b/dygraph/seq2seq/infer.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=7
+
+python infer.py \
+    --attention True \
+    --src_lang en --tar_lang vi \
+    --num_layers 2 \
+    --hidden_size 512 \
+    --src_vocab_size 17191 \
+    --tar_vocab_size 7709 \
+    --batch_size 1 \
+    --dropout 0.2 \
+    --init_scale  0.1 \
+    --max_grad_norm 5.0 \
+    --vocab_prefix data/en-vi/vocab \
+    --infer_file data/en-vi/tst2013.en \
+    --reload_model attention_models/epoch_10 \
+    --infer_output_file attention_infer_output/infer_output.txt \
+    --beam_size 10 \
+    --use_gpu True
+
+
diff --git a/dygraph/seq2seq/reader.py b/dygraph/seq2seq/reader.py
new file mode 100755
index 0000000000000000000000000000000000000000..4f27560722a839c6bc51b3302fe1cc6b36064b65
--- /dev/null
+++ b/dygraph/seq2seq/reader.py
@@ -0,0 +1,220 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utilities for parsing the IWSLT parallel text files."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import os
+import io
+import sys
+import numpy as np
+
+Py3 = sys.version_info[0] == 3
+
+UNK_ID = 0
+
+
+def _read_words(filename):
+    with io.open(filename, "r", encoding='utf-8') as f:
+        if Py3:
+            return f.read().replace("\n", "").split()
+        else:
+            return f.read().decode("utf-8").replace(u"\n", u"").split()
+
+
+def read_all_line(filename):
+    data = []
+    with io.open(filename, "r", encoding='utf-8') as f:
+        for line in f.readlines():
+            data.append(line.strip())
+    return data
+
+
+def _build_vocab(filename):
+    vocab_dict = {}
+    ids = 0
+    with io.open(filename, "r", encoding='utf-8') as f:
+        for line in f.readlines():
+            vocab_dict[line.strip()] = ids
+            ids += 1
+
+    print("vocab word num", ids)
+
+    return vocab_dict
+
+
+def _para_file_to_ids(src_file, tar_file, src_vocab, tar_vocab):
+    src_data = []
+    with io.open(src_file, "r", encoding='utf-8') as f_src:
+        for line in f_src.readlines():
+            arra = line.strip().split()
+            ids = [src_vocab[w] if w in src_vocab else UNK_ID for w in arra]
+            src_data.append(ids)
+
+    tar_data = []
+    with io.open(tar_file, "r", encoding='utf-8') as f_tar:
+        for line in f_tar.readlines():
+            arra = line.strip().split()
+            ids = [tar_vocab[w] if w in tar_vocab else UNK_ID for w in arra]
+            # wrap target sentences with <s> (id 1) and </s> (id 2)
+            ids = [1] + ids + [2]
+            tar_data.append(ids)
+
+    return src_data, tar_data
+
+
+def filter_len(src, tar, max_sequence_len=50):
+    new_src = []
+    new_tar = []
+
+    for id1, id2 in zip(src, tar):
+        if len(id1) > max_sequence_len:
+            id1 = id1[:max_sequence_len]
+        if len(id2) > max_sequence_len + 2:
+            id2 = id2[:max_sequence_len + 2]
+
+        new_src.append(id1)
+        new_tar.append(id2)
+
+    return new_src, new_tar
+
+
+def raw_data(src_lang,
+             tar_lang,
+             vocab_prefix,
+             train_prefix,
+             eval_prefix,
+             test_prefix,
+             max_sequence_len=50):
+
+    src_vocab_file = vocab_prefix + "." + src_lang
+    tar_vocab_file = vocab_prefix + "." + tar_lang
+
+    src_train_file = train_prefix + "." + src_lang
+    tar_train_file = train_prefix + "." + tar_lang
+
+    src_eval_file = eval_prefix + "." + src_lang
+    tar_eval_file = eval_prefix + "." + tar_lang
+
+    src_test_file = test_prefix + "." + src_lang
+    tar_test_file = test_prefix + "." + tar_lang
+
+    src_vocab = _build_vocab(src_vocab_file)
+    tar_vocab = _build_vocab(tar_vocab_file)
+
+    train_src, train_tar = _para_file_to_ids(src_train_file, tar_train_file,
+                                             src_vocab, tar_vocab)
+    train_src, train_tar = filter_len(
+        train_src, train_tar, max_sequence_len=max_sequence_len)
+    eval_src, eval_tar = _para_file_to_ids(src_eval_file, tar_eval_file,
+                                           src_vocab, tar_vocab)
+
+    test_src, test_tar = _para_file_to_ids(src_test_file, tar_test_file,
+                                           src_vocab, tar_vocab)
+
+    return (train_src, train_tar), (eval_src, eval_tar), (test_src, test_tar), \
+        (src_vocab, tar_vocab)
+
+
+def raw_mono_data(vocab_file, file_path):
+    src_vocab = _build_vocab(vocab_file)
+
+    test_src, test_tar = _para_file_to_ids(file_path, file_path,
+                                           src_vocab, src_vocab)
+
+    return (test_src, test_tar)
+
+
+def get_data_iter(raw_data,
+                  batch_size,
+                  mode='train',
+                  enable_ce=False,
+                  cache_num=20):
+
+    src_data, tar_data = raw_data
+
+    data_len = len(src_data)
+
+    index = np.arange(data_len)
+    if mode == "train" and not enable_ce:
+        np.random.shuffle(index)
+
+    def to_pad_np(data, source=False):
+        max_len = 0
+        bs = min(batch_size, len(data))
+        for ele in data:
+            if len(ele) > max_len:
+                max_len = len(ele)
+
+        ids = np.ones((bs, max_len), dtype='int64') * 2
+        mask = np.zeros((bs), dtype='int32')
+
+        for i, ele in enumerate(data):
+            ids[i, :len(ele)] = ele
+            if not source:
+                mask[i] = len(ele) - 1
+            else:
+                mask[i] = len(ele)
+
+        return ids, mask
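+    # For example, with batch_size=2 and target sentences of length 3 and 5
+    # (including <s> and </s>), to_pad_np returns ids of shape (2, 5) padded
+    # with token id 2 (</s>) and mask [2, 4]; on the source side the mask
+    # holds the full lengths instead (length - 1 on the target side matches
+    # the in_tar/label_tar shift applied in train.py and infer.py).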
+"""Utilities for parsing PTB text files.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import os +import io +import sys +import numpy as np + +Py3 = sys.version_info[0] == 3 + +UNK_ID = 0 + + +def _read_words(filename): + data = [] + with io.open(filename, "r", encoding='utf-8') as f: + if Py3: + return f.read().replace("\n", "").split() + else: + return f.read().decode("utf-8").replace(u"\n", u"").split() + + +def read_all_line(filenam): + data = [] + with io.open(filename, "r", encoding='utf-8') as f: + for line in f.readlines(): + data.append(line.strip()) + + +def _build_vocab(filename): + + vocab_dict = {} + ids = 0 + with io.open(filename, "r", encoding='utf-8') as f: + for line in f.readlines(): + vocab_dict[line.strip()] = ids + ids += 1 + + print("vocab word num", ids) + + return vocab_dict + + +def _para_file_to_ids(src_file, tar_file, src_vocab, tar_vocab): + + src_data = [] + with io.open(src_file, "r", encoding='utf-8') as f_src: + for line in f_src.readlines(): + arra = line.strip().split() + ids = [src_vocab[w] if w in src_vocab else UNK_ID for w in arra] + ids = ids + + src_data.append(ids) + + tar_data = [] + with io.open(tar_file, "r", encoding='utf-8') as f_tar: + for line in f_tar.readlines(): + arra = line.strip().split() + ids = [tar_vocab[w] if w in tar_vocab else UNK_ID for w in arra] + + ids = [1] + ids + [2] + + tar_data.append(ids) + + return src_data, tar_data + + +def filter_len(src, tar, max_sequence_len=50): + new_src = [] + new_tar = [] + + for id1, id2 in zip(src, tar): + if len(id1) > max_sequence_len: + id1 = id1[:max_sequence_len] + if len(id2) > max_sequence_len + 2: + id2 = id2[:max_sequence_len + 2] + + new_src.append(id1) + new_tar.append(id2) + + return new_src, new_tar + + +def raw_data(src_lang, + tar_lang, + vocab_prefix, + train_prefix, + eval_prefix, + test_prefix, + max_sequence_len=50): + + src_vocab_file = vocab_prefix + "." + src_lang + tar_vocab_file = vocab_prefix + "." + tar_lang + + src_train_file = train_prefix + "." + src_lang + tar_train_file = train_prefix + "." + tar_lang + + src_eval_file = eval_prefix + "." + src_lang + tar_eval_file = eval_prefix + "." + tar_lang + + src_test_file = test_prefix + "." + src_lang + tar_test_file = test_prefix + "." 
+ tar_lang + + src_vocab = _build_vocab(src_vocab_file) + tar_vocab = _build_vocab(tar_vocab_file) + + train_src, train_tar = _para_file_to_ids( src_train_file, tar_train_file, \ + src_vocab, tar_vocab ) + train_src, train_tar = filter_len( + train_src, train_tar, max_sequence_len=max_sequence_len) + eval_src, eval_tar = _para_file_to_ids( src_eval_file, tar_eval_file, \ + src_vocab, tar_vocab ) + + test_src, test_tar = _para_file_to_ids( src_test_file, tar_test_file, \ + src_vocab, tar_vocab ) + + return ( train_src, train_tar), (eval_src, eval_tar), (test_src, test_tar),\ + (src_vocab, tar_vocab) + + +def raw_mono_data(vocab_file, file_path): + + src_vocab = _build_vocab(vocab_file) + + test_src, test_tar = _para_file_to_ids( file_path, file_path, \ + src_vocab, src_vocab ) + + return (test_src, test_tar) + + +def get_data_iter(raw_data, + batch_size, + mode='train', + enable_ce=False, + cache_num=20): + + src_data, tar_data = raw_data + + data_len = len(src_data) + + index = np.arange(data_len) + if mode == "train" and not enable_ce: + np.random.shuffle(index) + + def to_pad_np(data, source=False): + max_len = 0 + bs = min(batch_size, len(data)) + for ele in data: + if len(ele) > max_len: + max_len = len(ele) + + ids = np.ones((bs, max_len), dtype='int64') * 2 + mask = np.zeros((bs), dtype='int32') + + for i, ele in enumerate(data): + ids[i, :len(ele)] = ele + if not source: + mask[i] = len(ele) - 1 + else: + mask[i] = len(ele) + + return ids, mask + + b_src = [] + + if mode != "train": + cache_num = 1 + for j in range(data_len): + if len(b_src) == batch_size * cache_num: + # build batch size + + # sort + if mode == 'infer': + new_cache = b_src + else: + new_cache = sorted(b_src, key=lambda k: len(k[0])) + + + for i in range(cache_num): + batch_data = new_cache[i * batch_size:(i + 1) * batch_size] + src_cache = [w[0] for w in batch_data] + tar_cache = [w[1] for w in batch_data] + src_ids, src_mask = to_pad_np(src_cache, source=True) + tar_ids, tar_mask = to_pad_np(tar_cache) + yield (src_ids, src_mask, tar_ids, tar_mask) + + b_src = [] + + b_src.append((src_data[index[j]], tar_data[index[j]])) + if len(b_src) == batch_size * cache_num or mode == 'infer': + if mode == 'infer': + new_cache = b_src + else: + new_cache = sorted(b_src, key=lambda k: len(k[0])) + + for i in range(cache_num): + batch_end = min(len(new_cache), (i + 1) * batch_size) + batch_data = new_cache[i * batch_size: batch_end] + src_cache = [w[0] for w in batch_data] + tar_cache = [w[1] for w in batch_data] + src_ids, src_mask = to_pad_np(src_cache, source=True) + tar_ids, tar_mask = to_pad_np(tar_cache) + yield (src_ids, src_mask, tar_ids, tar_mask) diff --git a/dygraph/seq2seq/rnn.py b/dygraph/seq2seq/rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..9d841530ebd865e3c6e21911ef8b04da498ded15 --- /dev/null +++ b/dygraph/seq2seq/rnn.py @@ -0,0 +1,94 @@ +from paddle.fluid import layers +from paddle.fluid.dygraph import Layer + +class BasicLSTMUnit(Layer): + """ + **** + BasicLSTMUnit class, Using basic operator to build LSTM + The algorithm can be described as the code below. + .. math:: + i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i) + f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias ) + o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o) + \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c) + c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} + h_t &= o_t \odot tanh(c_t) + - $W$ terms denote weight matrices (e.g. 
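+
+    # Minimal usage sketch (assumes an active dygraph guard; `np` is numpy
+    # and `to_variable` is fluid.dygraph.to_variable; shapes illustrative):
+    #
+    #     cell = BasicLSTMUnit(hidden_size=4, input_size=4)
+    #     x = to_variable(np.zeros((2, 4), dtype='float32'))  # [batch, input]
+    #     h = to_variable(np.zeros((2, 4), dtype='float32'))  # [batch, hidden]
+    #     c = to_variable(np.zeros((2, 4), dtype='float32'))  # [batch, hidden]
+    #     new_h, new_c = cell(x, h, c)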
+    def forward(self, input, pre_hidden, pre_cell):
+        # project [input, h_{t-1}] through one fused weight matrix into the
+        # four gate pre-activations i, j (candidate), f, o
+        concat_input_hidden = layers.concat([input, pre_hidden], 1)
+        gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
+
+        gate_input = layers.elementwise_add(gate_input, self._bias)
+        i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
+        new_cell = layers.elementwise_add(
+            layers.elementwise_mul(
+                pre_cell,
+                layers.sigmoid(layers.elementwise_add(f, self._forget_bias))),
+            layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j)))
+        new_hidden = layers.tanh(new_cell) * layers.sigmoid(o)
+
+        return new_hidden, new_cell
\ No newline at end of file
diff --git a/dygraph/seq2seq/run.sh b/dygraph/seq2seq/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..25bc78a3c7f3303324c0f5fb9aac14b467918a16
--- /dev/null
+++ b/dygraph/seq2seq/run.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=0
+
+python train.py \
+    --src_lang en --tar_lang vi \
+    --attention True \
+    --num_layers 2 \
+    --hidden_size 512 \
+    --src_vocab_size 17191 \
+    --tar_vocab_size 7709 \
+    --batch_size 128 \
+    --dropout 0.2 \
+    --init_scale  0.1 \
+    --max_grad_norm 5.0 \
+    --train_data_prefix data/en-vi/train \
+    --eval_data_prefix data/en-vi/tst2012 \
+    --test_data_prefix data/en-vi/tst2013 \
+    --vocab_prefix data/en-vi/vocab \
+    --use_gpu True \
+    --model_path attention_models
diff --git a/dygraph/seq2seq/train.py b/dygraph/seq2seq/train.py
new file mode 100755
index 0000000000000000000000000000000000000000..a25dccc8b316c1cd61672bd47730379f49802ea2
--- /dev/null
+++ b/dygraph/seq2seq/train.py
@@ -0,0 +1,220 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import time
+import os
+import logging
+import random
+import math
+import contextlib
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph_grad_clip import GradClipByGlobalNorm
+
+import reader
+
+import sys
+if sys.version[0] == '2':
+    reload(sys)
+    sys.setdefaultencoding("utf-8")
+
+from args import *
+from base_model import BaseModel
+from attention_model import AttentionModel
+import pickle
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger("fluid")
+logger.setLevel(logging.INFO)
+
+
+def main():
+    args = parse_args()
+    print(args)
+    num_layers = args.num_layers
+    src_vocab_size = args.src_vocab_size
+    tar_vocab_size = args.tar_vocab_size
+    batch_size = args.batch_size
+    dropout = args.dropout
+    init_scale = args.init_scale
+    max_grad_norm = args.max_grad_norm
+    hidden_size = args.hidden_size
+
+    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+    with fluid.dygraph.guard(place):
+        if args.enable_ce:
+            fluid.default_startup_program().random_seed = 102
+            fluid.default_main_program().random_seed = 102
+            np.random.seed(102)
+            random.seed(102)
+
+        # Training process
+        if args.attention:
+            model = AttentionModel(
+                hidden_size,
+                src_vocab_size,
+                tar_vocab_size,
+                batch_size,
+                num_layers=num_layers,
+                init_scale=init_scale,
+                dropout=dropout)
+        else:
+            model = BaseModel(
+                hidden_size,
+                src_vocab_size,
+                tar_vocab_size,
+                batch_size,
+                num_layers=num_layers,
+                init_scale=init_scale,
+                dropout=dropout)
+        global_norm_clip = GradClipByGlobalNorm(max_grad_norm)
+        lr = args.learning_rate
+        opt_type = args.optimizer
+        if opt_type == "sgd":
+            optimizer = fluid.optimizer.SGD(lr, parameter_list=model.parameters())
+        elif opt_type == "adam":
+            optimizer = fluid.optimizer.Adam(lr, parameter_list=model.parameters())
+        else:
+            raise Exception("only [sgd|adam] optimizers are supported")
+
+        train_data_prefix = args.train_data_prefix
+        eval_data_prefix = args.eval_data_prefix
+        test_data_prefix = args.test_data_prefix
+        vocab_prefix = args.vocab_prefix
+        src_lang = args.src_lang
+        tar_lang = args.tar_lang
+        print("begin to load data")
+        raw_data = reader.raw_data(src_lang, tar_lang, vocab_prefix,
+                                   train_data_prefix, eval_data_prefix,
+                                   test_data_prefix, args.max_len)
+        print("finished load data")
+        train_data, valid_data, test_data, _ = raw_data
+
+        def prepare_input(batch, epoch_id=0):
+            src_ids, src_mask, tar_ids, tar_mask = batch
+            src_ids = src_ids.reshape((src_ids.shape[0], src_ids.shape[1]))
+            in_tar = tar_ids[:, :-1]
+            label_tar = tar_ids[:, 1:]
+
+            in_tar = in_tar.reshape((in_tar.shape[0], in_tar.shape[1]))
+            label_tar = label_tar.reshape(
+                (label_tar.shape[0], label_tar.shape[1], 1))
+            inputs = [src_ids, in_tar, label_tar, src_mask, tar_mask]
+            return inputs, np.sum(tar_mask)
+
+        def eval(data, epoch_id=0):
+            model.eval()
+            eval_data_iter = reader.get_data_iter(data, batch_size, mode='eval')
+            total_loss = 0.0
+            word_count = 0.0
+            for batch_id, batch in enumerate(eval_data_iter):
+                input_data_feed, word_num = prepare_input(batch, epoch_id)
+                loss = model(input_data_feed)
+
+                total_loss += loss * batch_size
+                word_count += word_num
+            ppl = np.exp(total_loss.numpy() / word_count)
+            model.train()
+            return ppl
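+        # eval() reports perplexity: the model returns cross entropy summed
+        # over time steps and averaged over the batch, so multiplying by
+        # batch_size recovers the total batch loss, and
+        #   ppl = exp(total_loss / target_word_count).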
+
+        max_epoch = args.max_epoch
+        for epoch_id in range(max_epoch):
+            model.train()
+            start_time = time.time()
+            if args.enable_ce:
+                train_data_iter = reader.get_data_iter(
+                    train_data, batch_size, enable_ce=True)
+            else:
+                train_data_iter = reader.get_data_iter(train_data, batch_size)
+
+            total_loss = 0
+            word_count = 0.0
+            batch_times = []
+            for batch_id, batch in enumerate(train_data_iter):
+                batch_start_time = time.time()
+                input_data_feed, word_num = prepare_input(
+                    batch, epoch_id=epoch_id)
+                word_count += word_num
+                loss = model(input_data_feed)
+                loss.backward()
+                optimizer.minimize(loss, grad_clip=global_norm_clip)
+                model.clear_gradients()
+                total_loss += loss * batch_size
+                batch_end_time = time.time()
+                batch_time = batch_end_time - batch_start_time
+                batch_times.append(batch_time)
+
+                if batch_id > 0 and batch_id % 100 == 0:
+                    print("-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f" %
+                          (epoch_id, batch_id, batch_time,
+                           np.exp(total_loss.numpy() / word_count)))
+                    total_loss = 0.0
+                    word_count = 0.0
+
+            end_time = time.time()
+            epoch_time = end_time - start_time
+            print(
+                "\nTrain epoch:[%d]; Epoch Time: %.5f; avg_time: %.5f s/step\n"
+                % (epoch_id, epoch_time, sum(batch_times) / len(batch_times)))
+
+            dir_name = os.path.join(args.model_path,
+                                    "epoch_" + str(epoch_id))
+            print("begin to save", dir_name)
+            paddle.fluid.save_dygraph(model.state_dict(), dir_name)
+            print("save finished")
+            dev_ppl = eval(valid_data)
+            print("dev ppl", dev_ppl)
+            test_ppl = eval(test_data)
+            print("test ppl", test_ppl)
+
+
+def get_cards():
+    num = 0
+    cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
+    if cards != '':
+        num = len(cards.split(","))
+    return num
+
+
+def check_version():
+    """
+    Log an error and exit when the installed version of PaddlePaddle is
+    not satisfied.
+    """
+    err = "PaddlePaddle version 1.6 or higher is required, " \
+          "or a suitable develop version is satisfied as well. \n" \
+          "Please make sure the version is good with your code."
+
+    try:
+        fluid.require_version('1.6.0')
+    except Exception as e:
+        logger.error(err)
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    check_version()
+    main()