diff --git a/nmt_without_attention/README.md b/nmt_without_attention/README.md
index 38361bbfbc3e029de872eba967a17453c5e7dac1..a54b715102574dae1b619997a1ed7a2bfc14131c 100644
--- a/nmt_without_attention/README.md
+++ b/nmt_without_attention/README.md
@@ -91,11 +91,11 @@ In PaddleBook, [Machine Translation](https://github.com/PaddlePaddle/book/blob/develop/08
 ```python
 #### Decoder
 encoder_last = paddle.layer.last_seq(input=encoded_vector)
-with paddle.layer.mixed(
+encoder_last_projected = paddle.layer.mixed(
     size=decoder_size,
-    act=paddle.activation.Tanh()) as encoder_last_projected:
-    encoder_last_projected += paddle.layer.full_matrix_projection(
-        input=encoder_last)
+    act=paddle.activation.Tanh(),
+    input=paddle.layer.full_matrix_projection(input=encoder_last))
+
 # gru step
 def gru_decoder_without_attention(enc_vec, current_word):
     '''
@@ -112,10 +112,12 @@ def gru_decoder_without_attention(enc_vec, current_word):
 
     context = paddle.layer.last_seq(input=enc_vec)
 
-    with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-        decoder_inputs +=paddle.layer.full_matrix_projection(input=context)
-        decoder_inputs += paddle.layer.full_matrix_projection(
-            input=current_word)
+    decoder_inputs = paddle.layer.mixed(
+        size=decoder_size * 3,
+        input=[
+            paddle.layer.full_matrix_projection(input=context),
+            paddle.layer.full_matrix_projection(input=current_word)
+        ])
 
     gru_step = paddle.layer.gru_step(
         name='gru_decoder',
@@ -125,24 +127,24 @@ def gru_decoder_without_attention(enc_vec, current_word):
         output_mem=decoder_mem,
         size=decoder_size)
 
-    with paddle.layer.mixed(
-            size=target_dict_dim,
-            bias_attr=True,
-            act=paddle.activation.Softmax()) as out:
-        out += paddle.layer.full_matrix_projection(input=gru_step)
+    out = paddle.layer.mixed(
+        size=target_dict_dim,
+        bias_attr=True,
+        act=paddle.activation.Softmax(),
+        input=paddle.layer.full_matrix_projection(input=gru_step))
     return out
 ```
 
 The decoder behaves very differently during training and testing:
 
 - **Training phase**: the word embeddings of the target translation, `trg_embedding`, are passed as an argument to the step function `gru_decoder_without_attention()`; `recurrent_group()` calls the step function iteratively, and finally the cost, the difference between the target translation and the actual decoder output, is computed and returned;
-- **Testing phase**: the decoder predicts the next word from the last generated word; `GeneratedInputV2()` automatically fetches the embeddings of the $k$ words the model predicts with the highest probabilities and feeds them to the step function, and `beam_search()` calls the step function `gru_decoder_without_attention()` to perform beam search and return the result.
+- **Testing phase**: the decoder predicts the next word from the last generated word; `GeneratedInput()` automatically fetches the embeddings of the $k$ words the model predicts with the highest probabilities and feeds them to the step function, and `beam_search()` calls the step function `gru_decoder_without_attention()` to perform beam search and return the result.
 
 The training and generation logic is implemented in the following `if-else` branches:
 
 ```python
 decoder_group_name = "decoder_group"
-group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
+group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
 group_inputs = [group_input1]
 if not generating:
     trg_embedding = paddle.layer.embedding(
@@ -166,7 +168,7 @@ if not generating:
 
     return cost
 else:
-    trg_embedding = paddle.layer.GeneratedInputV2(
+    trg_embedding = paddle.layer.GeneratedInput(
         size=target_dict_dim,
         embedding_name='_target_language_embedding',
         embedding_size=word_vector_dim)
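Note for reviewers on the API change above: `paddle.layer.mixed` sums its input projections, so the removed context-manager form that accumulated projections with `+=` and the new functional form that passes them through `input` define the same computation. A minimal, self-contained sketch of the new style, assuming the `paddle.v2` API; the layer names and sizes here are illustrative and not part of this patch:

```python
import paddle.v2 as paddle

paddle.init(use_gpu=False, trainer_count=1)

# Two illustrative inputs; only their projections below matter.
context = paddle.layer.data(
    name='context', type=paddle.data_type.dense_vector(512))
word = paddle.layer.data(
    name='word', type=paddle.data_type.dense_vector(512))

# The mixed layer adds the two full-matrix projections element-wise
# before applying the activation -- exactly what the old
# `with ... as decoder_inputs:` block expressed with `+=`.
decoder_inputs = paddle.layer.mixed(
    size=512 * 3,
    input=[
        paddle.layer.full_matrix_projection(input=context),
        paddle.layer.full_matrix_projection(input=word)
    ])
```

With `act` left at its default linear activation, this reproduces the `decoder_size * 3` input that `gru_step` expects.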
diff --git a/nmt_without_attention/index.html b/nmt_without_attention/index.html
index d749ff5722aa4144743fdca45f2ac0418c9db0b3..35177ee5a679fe4f826dfd219721ef2e36b7df83 100644
--- a/nmt_without_attention/index.html
+++ b/nmt_without_attention/index.html
@@ -133,11 +133,11 @@ In PaddleBook, [Machine Translation](https://github.com/PaddlePaddle/book/blob/develop/08
 ```python
 #### Decoder
 encoder_last = paddle.layer.last_seq(input=encoded_vector)
-with paddle.layer.mixed(
+encoder_last_projected = paddle.layer.mixed(
     size=decoder_size,
-    act=paddle.activation.Tanh()) as encoder_last_projected:
-    encoder_last_projected += paddle.layer.full_matrix_projection(
-        input=encoder_last)
+    act=paddle.activation.Tanh(),
+    input=paddle.layer.full_matrix_projection(input=encoder_last))
+
 # gru step
 def gru_decoder_without_attention(enc_vec, current_word):
     '''
@@ -154,10 +154,12 @@ def gru_decoder_without_attention(enc_vec, current_word):
 
     context = paddle.layer.last_seq(input=enc_vec)
 
-    with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-        decoder_inputs +=paddle.layer.full_matrix_projection(input=context)
-        decoder_inputs += paddle.layer.full_matrix_projection(
-            input=current_word)
+    decoder_inputs = paddle.layer.mixed(
+        size=decoder_size * 3,
+        input=[
+            paddle.layer.full_matrix_projection(input=context),
+            paddle.layer.full_matrix_projection(input=current_word)
+        ])
 
     gru_step = paddle.layer.gru_step(
         name='gru_decoder',
@@ -167,24 +169,24 @@ def gru_decoder_without_attention(enc_vec, current_word):
         output_mem=decoder_mem,
         size=decoder_size)
 
-    with paddle.layer.mixed(
-            size=target_dict_dim,
-            bias_attr=True,
-            act=paddle.activation.Softmax()) as out:
-        out += paddle.layer.full_matrix_projection(input=gru_step)
+    out = paddle.layer.mixed(
+        size=target_dict_dim,
+        bias_attr=True,
+        act=paddle.activation.Softmax(),
+        input=paddle.layer.full_matrix_projection(input=gru_step))
     return out
 ```
 
 The decoder behaves very differently during training and testing:
 
 - **Training phase**: the word embeddings of the target translation, `trg_embedding`, are passed as an argument to the step function `gru_decoder_without_attention()`; `recurrent_group()` calls the step function iteratively, and finally the cost, the difference between the target translation and the actual decoder output, is computed and returned;
-- **Testing phase**: the decoder predicts the next word from the last generated word; `GeneratedInputV2()` automatically fetches the embeddings of the $k$ words the model predicts with the highest probabilities and feeds them to the step function, and `beam_search()` calls the step function `gru_decoder_without_attention()` to perform beam search and return the result.
+- **Testing phase**: the decoder predicts the next word from the last generated word; `GeneratedInput()` automatically fetches the embeddings of the $k$ words the model predicts with the highest probabilities and feeds them to the step function, and `beam_search()` calls the step function `gru_decoder_without_attention()` to perform beam search and return the result.
 
 The training and generation logic is implemented in the following `if-else` branches:
 
 ```python
 decoder_group_name = "decoder_group"
-group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
+group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
 group_inputs = [group_input1]
 if not generating:
     trg_embedding = paddle.layer.embedding(
@@ -208,7 +210,7 @@ if not generating:
 
     return cost
 else:
-    trg_embedding = paddle.layer.GeneratedInputV2(
+    trg_embedding = paddle.layer.GeneratedInput(
         size=target_dict_dim,
         embedding_name='_target_language_embedding',
         embedding_size=word_vector_dim)
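The hunks above rename `GeneratedInputV2`/`StaticInputV2` but do not show where the generated embeddings end up. For context, in this model the generating branch appends `trg_embedding` to `group_inputs` and hands them to `paddle.layer.beam_search`, which drives `gru_decoder_without_attention` step by step. A sketch of that branch's shape follows; the beam parameters and token ids are illustrative, not introduced by this patch:

```python
# Sketch (not part of this patch) of the tail of the `else:` branch.
group_inputs.append(trg_embedding)
beam_gen = paddle.layer.beam_search(
    name=decoder_group_name,
    step=gru_decoder_without_attention,
    input=group_inputs,
    bos_id=0,        # id of the start-of-sentence token <s>
    eos_id=1,        # id of the end-of-sentence token <e>
    beam_size=3,     # number of candidates kept per step
    max_length=250)  # stop expanding a candidate after 250 words
```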
diff --git a/nmt_without_attention/nmt_without_attention.py b/nmt_without_attention/nmt_without_attention.py
index e5a4e1b602226da802c5903d83c0d963ae37bd44..5a61b525e67f7d07f66ae8cc5064c0244bc0b6f3 100644
--- a/nmt_without_attention/nmt_without_attention.py
+++ b/nmt_without_attention/nmt_without_attention.py
@@ -16,7 +16,7 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
     '''
     Define the network structure of NMT, including encoder and decoder.
 
-    :param source_dict_dim: size of source dictionary 
+    :param source_dict_dim: size of source dictionary
     :type source_dict_dim : int
    :param target_dict_dim: size of target dictionary
     :type target_dict_dim: int
@@ -41,11 +41,11 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
         return_seq=True)
     #### Decoder
     encoder_last = paddle.layer.last_seq(input=encoded_vector)
-    with paddle.layer.mixed(
-            size=decoder_size,
-            act=paddle.activation.Tanh()) as encoder_last_projected:
-        encoder_last_projected += paddle.layer.full_matrix_projection(
-            input=encoder_last)
+    encoder_last_projected = paddle.layer.mixed(
+        size=decoder_size,
+        act=paddle.activation.Tanh(),
+        input=paddle.layer.full_matrix_projection(input=encoder_last))
+
     # gru step
     def gru_decoder_without_attention(enc_vec, current_word):
         '''
@@ -63,10 +63,12 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
 
         context = paddle.layer.last_seq(input=enc_vec)
 
-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
+        decoder_inputs = paddle.layer.mixed(
+            size=decoder_size * 3,
+            input=[
+                paddle.layer.full_matrix_projection(input=context),
+                paddle.layer.full_matrix_projection(input=current_word)
+            ])
 
         gru_step = paddle.layer.gru_step(
             name='gru_decoder',
@@ -76,15 +78,15 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
             output_mem=decoder_mem,
             size=decoder_size)
 
-        with paddle.layer.mixed(
-                size=target_dict_dim,
-                bias_attr=True,
-                act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
+        out = paddle.layer.mixed(
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax(),
+            input=paddle.layer.full_matrix_projection(input=gru_step))
         return out
 
     decoder_group_name = "decoder_group"
-    group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
     group_inputs = [group_input1]
 
     if not generating:
@@ -109,7 +111,7 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
 
         return cost
     else:
-        trg_embedding = paddle.layer.GeneratedInputV2(
+        trg_embedding = paddle.layer.GeneratedInput(
             size=target_dict_dim,
             embedding_name='_target_language_embedding',
             embedding_size=word_vector_dim)
@@ -194,7 +196,7 @@ def generate(source_dict_dim, target_dict_dim, init_models_path):
     beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True)
     with gzip.open(init_models_path) as f:
         parameters = paddle.parameters.Parameters.from_tar(f)
-    # prob is the prediction probabilities, and id is the prediction word. 
+    # prob is the prediction probabilities, and id is the predicted word.
     beam_result = paddle.infer(
         output_layer=beam_gen,
         parameters=parameters,
@@ -244,10 +246,10 @@ def main():
     target_language_dict_dim = 30000
 
     if generating:
-        # shoud pass the right generated model's path here
+        # modify this path to specify a trained model.
         init_models_path = 'models/nmt_without_att_params_batch_1800.tar.gz'
         if not os.path.exists(init_models_path):
-            print "Cannot find models for generation"
+            print "Trained model cannot be found."
             exit(1)
         generate(source_language_dict_dim, target_language_dict_dim,
                  init_models_path)
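A final note on the `generate()` hunk: `paddle.infer` with `field=['prob', 'id']` returns the beam probabilities plus one flat id sequence in which `-1` separates candidate translations. A minimal post-processing sketch, assuming that layout, a `trg_dict` mapping from word ids to target-language words, and the repo's leading `<s>` marker convention; the helper name and details are illustrative, not part of this patch:

```python
def parse_beam_result(beam_result, trg_dict):
    """Sketch: split the flat id stream from paddle.infer into sentences.

    Assumes beam_result = (prob, ids), where `ids` concatenates all
    candidate translations and -1 marks the end of each candidate.
    """
    prob, ids = beam_result
    sentences, current = [], []
    for word_id in ids:
        if word_id == -1:  # end of one candidate translation
            # drop the leading <s> marker, join the remaining words
            sentences.append(' '.join(trg_dict[w] for w in current[1:]))
            current = []
        else:
            current.append(word_id)
    return sentences
```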