diff --git a/nmt_without_attention/README.md b/nmt_without_attention/README.md
index a54b715102574dae1b619997a1ed7a2bfc14131c..609126dd872ca15170fa371469019d4057aa8b43 100644
--- a/nmt_without_attention/README.md
+++ b/nmt_without_attention/README.md
@@ -1,6 +1,8 @@
 # 神经网络机器翻译模型

 ## 背景介绍
+- PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例则介绍的是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。
+
 机器翻译利用计算机将源语言转换成目标语言的同义表达,是自然语言处理中重要的研究方向,有着广泛的应用需求,其实现方式也经历了不断地演化。传统机器翻译方法主要基于规则或统计模型,需要人为地指定翻译规则或设计语言特征,效果依赖于人对源语言与目标语言的理解程度。近些年来,深度学习的提出与迅速发展使得特征的自动学习成为可能。深度学习首先在图像识别和语音识别中取得成功,进而在机器翻译等自然语言处理领域中掀起了研究热潮。机器翻译中的深度学习模型直接学习源语言到目标语言的映射,大为减少了学习过程中人的介入,同时显著地提高了翻译质量。本例介绍在PaddlePaddle中如何利用循环神经网络(Recurrent Neural Network, RNN)构建一个端到端(End-to-End)的神经网络机器翻译(Neural Machine Translation, NMT)模型。

 ## 模型概览
@@ -84,7 +86,6 @@ encoded_vector = paddle.networks.bidirectional_gru(

 ### 无注意力机制的解码器
-PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例则介绍的是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。

 对于流行的RNN单元,PaddlePaddle 已有很好的实现均可直接调用。如果希望在 RNN 每一个时间步实现某些自定义操作,可使用 PaddlePaddle 中的`recurrent_layer_group`。首先,自定义单步逻辑函数,再利用函数 `recurrent_group()` 循环调用单步逻辑函数处理整个序列。本例中的无注意力机制的解码器便是使用`recurrent_layer_group`来实现,其中,单步逻辑函数`gru_decoder_without_attention()`相关代码如下:

@@ -198,7 +199,7 @@ else:

 **a) 由网络定义,解析网络结构,初始化模型参数**

-```
+```python
 # initialize model
 cost = seq2seq_net(source_dict_dim, target_dict_dim)
 parameters = paddle.parameters.create(cost)
@@ -206,7 +207,7 @@ parameters = paddle.parameters.create(cost)

 **b) 设定训练过程中的优化策略、定义训练数据读取 `reader`**

-```
+```python
 # define optimize method and trainer
 optimizer = paddle.optimizer.RMSProp(
     learning_rate=1e-3,
@@ -223,7 +224,7 @@ wmt14_reader = paddle.batch(

 **c) 定义事件句柄,打印训练中间结果、保存模型快照**

-```
+```python
 # define event_handler callback
 def event_handler(event):
     if isinstance(event, paddle.event.EndIteration):
@@ -242,7 +243,7 @@ def event_handler(event):

 **d) 开始训练**

-```
+```python
 # start to train
 trainer.train(
     reader=wmt14_reader, event_handler=event_handler, num_passes=2)
@@ -250,13 +251,13 @@ trainer.train(

 启动模型训练的十分简单,只需在命令行窗口中执行

-```
-python nmt_without_attention_v2.py --train
+```bash
+python train.py
 ```

 输出样例为

-```
+```text
 Pass 0, Batch 0, Cost 267.674663, {'classification_error_evaluator': 1.0}
 .........
 Pass 0, Batch 10, Cost 172.892294, {'classification_error_evaluator': 0.953895092010498}
@@ -274,7 +275,7 @@ Pass 0, Batch 40, Cost 168.170543, {'classification_error_evaluator': 0.83481836

 **a) 加载测试样本**

-```
+```python
 # load data samples for generation
 gen_creator = paddle.dataset.wmt14.gen(source_dict_dim)
 gen_data = []
@@ -284,7 +285,7 @@ for item in gen_creator():

 **b) 初始化模型,执行`infer()`为每个输入样本生成`beam search`的翻译结果**

-```
+```python
 beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True)
 with gzip.open(init_models_path) as f:
     parameters = paddle.parameters.Parameters.from_tar(f)
@@ -298,7 +299,7 @@ beam_result = paddle.infer(

 **c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果**

-```
+```python
 # get the dictionary
 src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim)

@@ -323,19 +324,19 @@ for i in xrange(len(gen_data)):
 模型测试的执行与模型训练类似,只需执行

-```
-python nmt_without_attention_v2.py --generate
+```bash
+python generate.py
 ```

 则自动为测试数据生成了对应的翻译结果。
 设置beam search的宽度为3,输入某个法文句子

-```
+```text
 src: Elles connaissent leur entreprise mieux que personne .
 ```

 其对应的英文翻译结果为

-```
+```text
 prob = -3.754819: They know their business better than anyone .
 prob = -4.445528: They know their businesses better than anyone .
 prob = -5.026885: They know their business better than anybody .
diff --git a/nmt_without_attention/generate.py b/nmt_without_attention/generate.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3c220685e2bc75a13f57d4f38faa28cb31a4f65
--- /dev/null
+++ b/nmt_without_attention/generate.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+import os
+from network_conf import *
+
+
+def infer_a_batch(inferer, test_batch, beam_size, src_dict, trg_dict):
+    beam_result = inferer.infer(input=test_batch, field=["prob", "id"])
+
+    # the delimiter of generated sequences is -1,
+    # the first element of each generated sequence is the sequence length
+    seq_list, seq = [], []
+    for w in beam_result[1]:
+        if w != -1:
+            seq.append(w)
+        else:
+            seq_list.append(" ".join([trg_dict.get(w) for w in seq[1:]]))
+            seq = []
+
+    prob = beam_result[0]
+    for i, sample in enumerate(test_batch):
+        print("src: %s\n" % " ".join([src_dict.get(w) for w in sample[0]]))
+        for j in xrange(beam_size):
+            print("prob = %f: %s" % (prob[i][j], seq_list[i * beam_size + j]))
+        print("\n")
+
+
+def generate(source_dict_dim, target_dict_dim, model_path, batch_size):
+    """
+    Generating function for NMT
+
+    :param source_dict_dim: size of source dictionary
+    :type source_dict_dim: int
+    :param target_dict_dim: size of target dictionary
+    :type target_dict_dim: int
+    :param model_path: path for initial model
+    :type model_path: string
+    """
+
+    assert os.path.exists(model_path), "trained model does not exist."
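+    # overall flow: load the dictionaries and the trained parameters, rebuild
+    # the network in generating mode, then translate the test set in batches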
+
+    # step 1: prepare dictionary
+    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim)
+    beam_size = 5
+
+    # step 2: load the trained model
+    paddle.init(use_gpu=True, trainer_count=1)
+    with gzip.open(model_path) as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+    beam_gen = seq2seq_net(
+        source_dict_dim,
+        target_dict_dim,
+        beam_size=beam_size,
+        max_length=100,
+        is_generating=True)
+    inferer = paddle.inference.Inference(
+        output_layer=beam_gen, parameters=parameters)
+
+    # step 3: iterating over the testing dataset
+    test_batch = []
+    for idx, item in enumerate(paddle.dataset.wmt14.gen(source_dict_dim)()):
+        test_batch.append([item[0]])
+        if len(test_batch) == batch_size:
+            infer_a_batch(inferer, test_batch, beam_size, src_dict, trg_dict)
+            test_batch = []
+
+    if len(test_batch):
+        infer_a_batch(inferer, test_batch, beam_size, src_dict, trg_dict)
+        test_batch = []
+
+
+if __name__ == "__main__":
+    generate(
+        source_dict_dim=3000,
+        target_dict_dim=3000,
+        batch_size=5,
+        model_path="models/nmt_without_att_params_batch_00001.tar.gz")
diff --git a/nmt_without_attention/index.html b/nmt_without_attention/index.html
index 35177ee5a679fe4f826dfd219721ef2e36b7df83..b68bdb5b6d1c84f8cde6f66583788f678c79e858 100644
--- a/nmt_without_attention/index.html
+++ b/nmt_without_attention/index.html
@@ -43,6 +43,8 @@
 # 神经网络机器翻译模型

 ## 背景介绍
+- PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例则介绍的是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。
+
 机器翻译利用计算机将源语言转换成目标语言的同义表达,是自然语言处理中重要的研究方向,有着广泛的应用需求,其实现方式也经历了不断地演化。传统机器翻译方法主要基于规则或统计模型,需要人为地指定翻译规则或设计语言特征,效果依赖于人对源语言与目标语言的理解程度。近些年来,深度学习的提出与迅速发展使得特征的自动学习成为可能。深度学习首先在图像识别和语音识别中取得成功,进而在机器翻译等自然语言处理领域中掀起了研究热潮。机器翻译中的深度学习模型直接学习源语言到目标语言的映射,大为减少了学习过程中人的介入,同时显著地提高了翻译质量。本例介绍在PaddlePaddle中如何利用循环神经网络(Recurrent Neural Network, RNN)构建一个端到端(End-to-End)的神经网络机器翻译(Neural Machine Translation, NMT)模型。

 ## 模型概览
@@ -126,7 +128,6 @@ encoded_vector = paddle.networks.bidirectional_gru(

 ### 无注意力机制的解码器
-PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例则介绍的是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。

 对于流行的RNN单元,PaddlePaddle 已有很好的实现均可直接调用。如果希望在 RNN 每一个时间步实现某些自定义操作,可使用 PaddlePaddle 中的`recurrent_layer_group`。首先,自定义单步逻辑函数,再利用函数 `recurrent_group()` 循环调用单步逻辑函数处理整个序列。本例中的无注意力机制的解码器便是使用`recurrent_layer_group`来实现,其中,单步逻辑函数`gru_decoder_without_attention()`相关代码如下:

@@ -240,7 +241,7 @@ else:

 **a) 由网络定义,解析网络结构,初始化模型参数**

-```
+```python
 # initialize model
 cost = seq2seq_net(source_dict_dim, target_dict_dim)
 parameters = paddle.parameters.create(cost)
@@ -248,7 +249,7 @@ parameters = paddle.parameters.create(cost)

 **b) 设定训练过程中的优化策略、定义训练数据读取 `reader`**

-```
+```python
 # define optimize method and trainer
 optimizer = paddle.optimizer.RMSProp(
     learning_rate=1e-3,
@@ -265,7 +266,7 @@ wmt14_reader = paddle.batch(

 **c) 定义事件句柄,打印训练中间结果、保存模型快照**

-```
+```python
 # define event_handler callback
 def event_handler(event):
     if isinstance(event, paddle.event.EndIteration):
@@ -284,7 +285,7 @@ def event_handler(event):

 **d) 开始训练**

-```
+```python
 # start to train
 trainer.train(
     reader=wmt14_reader, event_handler=event_handler, num_passes=2)
@@ -292,13 +293,13 @@ trainer.train(

 启动模型训练的十分简单,只需在命令行窗口中执行

-```
-python nmt_without_attention_v2.py --train
+```bash
+python train.py
 ```

 输出样例为

-```
+```text
 Pass 0, Batch 0, Cost 267.674663, {'classification_error_evaluator': 1.0}
 .........
 Pass 0, Batch 10, Cost 172.892294, {'classification_error_evaluator': 0.953895092010498}
@@ -316,7 +317,7 @@ Pass 0, Batch 40, Cost 168.170543, {'classification_error_evaluator': 0.83481836

 **a) 加载测试样本**

-```
+```python
 # load data samples for generation
 gen_creator = paddle.dataset.wmt14.gen(source_dict_dim)
 gen_data = []
@@ -326,7 +327,7 @@ for item in gen_creator():

 **b) 初始化模型,执行`infer()`为每个输入样本生成`beam search`的翻译结果**

-```
+```python
 beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True)
 with gzip.open(init_models_path) as f:
     parameters = paddle.parameters.Parameters.from_tar(f)
@@ -340,7 +341,7 @@ beam_result = paddle.infer(

 **c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果**

-```
+```python
 # get the dictionary
 src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim)

@@ -365,19 +366,19 @@ for i in xrange(len(gen_data)):
 模型测试的执行与模型训练类似,只需执行

-```
-python nmt_without_attention_v2.py --generate
+```bash
+python generate.py
 ```

 则自动为测试数据生成了对应的翻译结果。
 设置beam search的宽度为3,输入某个法文句子

-```
+```text
 src: Elles connaissent leur entreprise mieux que personne .
 ```

 其对应的英文翻译结果为

-```
+```text
 prob = -3.754819: They know their business better than anyone .
 prob = -4.445528: They know their businesses better than anyone .
 prob = -5.026885: They know their business better than anybody .
diff --git a/nmt_without_attention/network_conf.py b/nmt_without_attention/network_conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..77a1dc77c3c85c633cd7fbdf085d02780ded0075
--- /dev/null
+++ b/nmt_without_attention/network_conf.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+import paddle.v2 as paddle
+import sys
+import gzip
+
+
+def seq2seq_net(source_dict_dim,
+                target_dict_dim,
+                word_vector_dim=620,
+                rnn_hidden_size=1000,
+                beam_size=1,
+                max_length=50,
+                is_generating=False):
+    """
+    Define the network structure of NMT, including encoder and decoder.
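+    The encoder is a bidirectional GRU over the source sentence; the decoder
+    is a single GRU that is initialized from the encoder's last state and
+    reads the same fixed source summary at every generation step (no attention).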
+
+    :param source_dict_dim: size of source dictionary
+    :type source_dict_dim: int
+    :param target_dict_dim: size of target dictionary
+    :type target_dict_dim: int
+    :param word_vector_dim: size of source language word embedding
+    :type word_vector_dim: int
+    :param rnn_hidden_size: size of hidden state of encoder and decoder RNN
+    :type rnn_hidden_size: int
+    :param beam_size: expansion width in each step when generating
+    :type beam_size: int
+    :param max_length: max iteration number in generation
+    :type max_length: int
+    :param is_generating: whether to generate sequence or to train
+    :type is_generating: bool
+    """
+
+    decoder_size = encoder_size = rnn_hidden_size
+
+    src_word_id = paddle.layer.data(
+        name="source_language_word",
+        type=paddle.data_type.integer_value_sequence(source_dict_dim))
+    src_embedding = paddle.layer.embedding(
+        input=src_word_id, size=word_vector_dim)
+
+    # use bidirectional_gru as the encoder
+    encoded_vector = paddle.networks.bidirectional_gru(
+        input=src_embedding,
+        size=encoder_size,
+        fwd_act=paddle.activation.Tanh(),
+        fwd_gate_act=paddle.activation.Sigmoid(),
+        bwd_act=paddle.activation.Tanh(),
+        bwd_gate_act=paddle.activation.Sigmoid(),
+        return_seq=True)
+    #### Decoder
+    encoder_last = paddle.layer.last_seq(input=encoded_vector)
+    encoder_last_projected = paddle.layer.fc(
+        size=decoder_size, act=paddle.activation.Tanh(), input=encoder_last)
+
+    # gru step
+    def gru_decoder_without_attention(enc_vec, current_word):
+        """
+        Step function for gru decoder
+
+        :param enc_vec: encoded vector of source language
+        :type enc_vec: layer object
+        :param current_word: current input of decoder
+        :type current_word: layer object
+        """
+        decoder_mem = paddle.layer.memory(
+            name="gru_decoder",
+            size=decoder_size,
+            boot_layer=encoder_last_projected)
+
+        context = paddle.layer.last_seq(input=enc_vec)
+
+        decoder_inputs = paddle.layer.fc(
+            size=decoder_size * 3, input=[context, current_word])
+
+        gru_step = paddle.layer.gru_step(
+            name="gru_decoder",
+            act=paddle.activation.Tanh(),
+            gate_act=paddle.activation.Sigmoid(),
+            input=decoder_inputs,
+            output_mem=decoder_mem,
+            size=decoder_size)
+
+        out = paddle.layer.fc(
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax(),
+            input=gru_step)
+        return out
+
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector)
+    group_inputs = [group_input1]
+
+    decoder_group_name = "decoder_group"
+    if is_generating:
+        trg_embedding = paddle.layer.GeneratedInput(
+            size=target_dict_dim,
+            embedding_name="_target_language_embedding",
+            embedding_size=word_vector_dim)
+        group_inputs.append(trg_embedding)
+
+        beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_without_attention,
+            input=group_inputs,
+            bos_id=0,
+            eos_id=1,
+            beam_size=beam_size,
+            max_length=max_length)
+
+        return beam_gen
+    else:
+        trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name="target_language_word",
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name="_target_language_embedding"))
+        group_inputs.append(trg_embedding)
+
+        decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_without_attention,
+            input=group_inputs)
+
+        lbl = paddle.layer.data(
+            name="target_language_next_word",
+            type=paddle.data_type.integer_value_sequence(target_dict_dim))
+        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+        return cost
diff --git a/nmt_without_attention/nmt_without_attention.py b/nmt_without_attention/nmt_without_attention.py
deleted file mode 100644
index 5a61b525e67f7d07f66ae8cc5064c0244bc0b6f3..0000000000000000000000000000000000000000
--- a/nmt_without_attention/nmt_without_attention.py
+++ /dev/null
@@ -1,263 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-import gzip
-import paddle.v2 as paddle
-
-### Parameters
-word_vector_dim = 620
-latent_chain_dim = 1000
-
-beam_size = 5
-max_length = 50
-
-
-def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
-    '''
-    Define the network structure of NMT, including encoder and decoder.
-
-    :param source_dict_dim: size of source dictionary
-    :type source_dict_dim : int
-    :param target_dict_dim: size of target dictionary
-    :type target_dict_dim: int
-    '''
-
-    decoder_size = encoder_size = latent_chain_dim
-
-    #### Encoder
-    src_word_id = paddle.layer.data(
-        name='source_language_word',
-        type=paddle.data_type.integer_value_sequence(source_dict_dim))
-    src_embedding = paddle.layer.embedding(
-        input=src_word_id, size=word_vector_dim)
-    # use bidirectional_gru
-    encoded_vector = paddle.networks.bidirectional_gru(
-        input=src_embedding,
-        size=encoder_size,
-        fwd_act=paddle.activation.Tanh(),
-        fwd_gate_act=paddle.activation.Sigmoid(),
-        bwd_act=paddle.activation.Tanh(),
-        bwd_gate_act=paddle.activation.Sigmoid(),
-        return_seq=True)
-    #### Decoder
-    encoder_last = paddle.layer.last_seq(input=encoded_vector)
-    encoder_last_projected = paddle.layer.mixed(
-        size=decoder_size,
-        act=paddle.activation.Tanh(),
-        input=paddle.layer.full_matrix_projection(input=encoder_last))
-
-    # gru step
-    def gru_decoder_without_attention(enc_vec, current_word):
-        '''
-        Step function for gru decoder
-
-        :param enc_vec: encoded vector of source language
-        :type enc_vec: layer object
-        :param current_word: current input of decoder
-        :type current_word: layer object
-        '''
-        decoder_mem = paddle.layer.memory(
-            name='gru_decoder',
-            size=decoder_size,
-            boot_layer=encoder_last_projected)
-
-        context = paddle.layer.last_seq(input=enc_vec)
-
-        decoder_inputs = paddle.layer.mixed(
-            size=decoder_size * 3,
-            input=[
-                paddle.layer.full_matrix_projection(input=context),
-                paddle.layer.full_matrix_projection(input=current_word)
-            ])
-
-        gru_step = paddle.layer.gru_step(
-            name='gru_decoder',
-            act=paddle.activation.Tanh(),
-            gate_act=paddle.activation.Sigmoid(),
-            input=decoder_inputs,
-            output_mem=decoder_mem,
-            size=decoder_size)
-
-        out = paddle.layer.mixed(
-            size=target_dict_dim,
-            bias_attr=True,
-            act=paddle.activation.Softmax(),
-            input=paddle.layer.full_matrix_projection(input=gru_step))
-        return out
-
-    decoder_group_name = "decoder_group"
-    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-    group_inputs = [group_input1]
-
-    if not generating:
-        trg_embedding = paddle.layer.embedding(
-            input=paddle.layer.data(
-                name='target_language_word',
-                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-            size=word_vector_dim,
-            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-        group_inputs.append(trg_embedding)
-
-        decoder = paddle.layer.recurrent_group(
-            name=decoder_group_name,
-            step=gru_decoder_without_attention,
-            input=group_inputs)
-
-        lbl = paddle.layer.data(
-            name='target_language_next_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim))
-        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
-
-        return cost
-    else:
-
-        trg_embedding = paddle.layer.GeneratedInput(
-            size=target_dict_dim,
-            embedding_name='_target_language_embedding',
-            embedding_size=word_vector_dim)
-        group_inputs.append(trg_embedding)
-
-        beam_gen = paddle.layer.beam_search(
-            name=decoder_group_name,
-            step=gru_decoder_without_attention,
-            input=group_inputs,
-            bos_id=0,
-            eos_id=1,
-            beam_size=beam_size,
-            max_length=max_length)
-
-        return beam_gen
-
-
-def train(source_dict_dim, target_dict_dim):
-    '''
-    Training function for NMT
-
-    :param source_dict_dim: size of source dictionary
-    :type source_dict_dim: int
-    :param target_dict_dim: size of target dictionary
-    :type target_dict_dim: int
-    '''
-    # initialize model
-    cost = seq2seq_net(source_dict_dim, target_dict_dim)
-    parameters = paddle.parameters.create(cost)
-
-    # define optimize method and trainer
-    optimizer = paddle.optimizer.RMSProp(
-        learning_rate=1e-3,
-        gradient_clipping_threshold=10.0,
-        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
-    trainer = paddle.trainer.SGD(
-        cost=cost, parameters=parameters, update_equation=optimizer)
-    # define data reader
-    wmt14_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(source_dict_dim), buf_size=8192),
-        batch_size=55)
-
-    # define event_handler callback
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0 and event.batch_id > 0:
-                with gzip.open('models/nmt_without_att_params_batch_%d.tar.gz' %
-                               event.batch_id, 'w') as f:
-                    parameters.to_tar(f)
-
-            if event.batch_id % 10 == 0:
-                print "\nPass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
-            else:
-                sys.stdout.write('.')
-                sys.stdout.flush()
-
-    # start to train
-    trainer.train(
-        reader=wmt14_reader, event_handler=event_handler, num_passes=2)
-
-
-def generate(source_dict_dim, target_dict_dim, init_models_path):
-    '''
-    Generating function for NMT
-
-    :param source_dict_dim: size of source dictionary
-    :type source_dict_dim: int
-    :param target_dict_dim: size of target dictionary
-    :type target_dict_dim: int
-    :param init_models_path: path for inital model
-    :type init_models_path: string
-    '''
-
-    # load data samples for generation
-    gen_creator = paddle.dataset.wmt14.gen(source_dict_dim)
-    gen_data = []
-    for item in gen_creator():
-        gen_data.append((item[0], ))
-
-    beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True)
-    with gzip.open(init_models_path) as f:
-        parameters = paddle.parameters.Parameters.from_tar(f)
-    # prob is the prediction probabilities, and id is the prediction word.
-    beam_result = paddle.infer(
-        output_layer=beam_gen,
-        parameters=parameters,
-        input=gen_data,
-        field=['prob', 'id'])
-
-    # get the dictionary
-    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim)
-
-    # the delimited element of generated sequences is -1,
-    # the first element of each generated sequence is the sequence length
-    seq_list, seq = [], []
-    for w in beam_result[1]:
-        if w != -1:
-            seq.append(w)
-        else:
-            seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
-            seq = []
-
-    prob = beam_result[0]
-    for i in xrange(len(gen_data)):
-        print "\n*******************************************************\n"
-        print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n"
-        for j in xrange(beam_size):
-            print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
-
-
-def usage_helper():
-    print "Please specify training/generating phase!"
-    print "Usage: python nmt_without_attention_v2.py --train/generate"
-    exit(1)
-
-
-def main():
-    if not (len(sys.argv) == 2):
-        usage_helper()
-    if sys.argv[1] == '--train':
-        generating = False
-    elif sys.argv[1] == '--generate':
-        generating = True
-    else:
-        usage_helper()
-
-    # initialize paddle
-    paddle.init(use_gpu=False, trainer_count=1)
-    source_language_dict_dim = 30000
-    target_language_dict_dim = 30000
-
-    if generating:
-        # modify this path to speicify a trained model.
-        init_models_path = 'models/nmt_without_att_params_batch_1800.tar.gz'
-        if not os.path.exists(init_models_path):
-            print "trained model cannot be found."
-            exit(1)
-        generate(source_language_dict_dim, target_language_dict_dim,
-                 init_models_path)
-    else:
-        if not os.path.exists('./models'):
-            os.system('mkdir ./models')
-        train(source_language_dict_dim, target_language_dict_dim)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/nmt_without_attention/train.py b/nmt_without_attention/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b847fa8e04334407f127fb3a5ed78f610111fb1
--- /dev/null
+++ b/nmt_without_attention/train.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+
+from network_conf import *
+
+
+def train(source_dict_dim, target_dict_dim):
+    '''
+    Training function for NMT
+
+    :param source_dict_dim: size of source dictionary
+    :type source_dict_dim: int
+    :param target_dict_dim: size of target dictionary
+    :type target_dict_dim: int
+    '''
+    # initialize model
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    cost = seq2seq_net(source_dict_dim, target_dict_dim)
+    parameters = paddle.parameters.create(cost)
+
+    # define optimize method and trainer
+    optimizer = paddle.optimizer.RMSProp(
+        learning_rate=1e-3,
+        gradient_clipping_threshold=10.0,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+    trainer = paddle.trainer.SGD(
+        cost=cost, parameters=parameters, update_equation=optimizer)
+    # define data reader
+    wmt14_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(source_dict_dim), buf_size=8192),
+        batch_size=8)
+
+    # define event_handler callback
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if not event.batch_id % 500 and event.batch_id:
+                with gzip.open("models/nmt_without_att_params_batch_%05d.tar.gz"
+                               % event.batch_id, "w") as f:
+                    parameters.to_tar(f)
+
+            if event.batch_id and not event.batch_id % 10:
+                print("\nPass %d, Batch %d, Cost %f, %s" %
+                      (event.pass_id, event.batch_id, event.cost,
+                       event.metrics))
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+
+    # start to train
+    trainer.train(
+        reader=wmt14_reader, event_handler=event_handler, num_passes=2)
+
+
+if __name__ == '__main__':
+    train(source_dict_dim=3000, target_dict_dim=3000)