From ff9a664dba3b7b3689d37e087327e2d0b59156fd Mon Sep 17 00:00:00 2001
From: dangqingqing
Date: Wed, 8 Mar 2017 15:29:19 +0800
Subject: [PATCH] also fix machine translation

---
 machine_translation/README.en.md  | 6 ++++++
 machine_translation/README.md     | 7 +++++--
 machine_translation/index.en.html | 6 ++++++
 machine_translation/index.html    | 7 +++++--
 4 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/machine_translation/README.en.md b/machine_translation/README.en.md
index 8a7f109..3713ec5 100644
--- a/machine_translation/README.en.md
+++ b/machine_translation/README.en.md
@@ -446,6 +446,7 @@ settings(
 This tutorial will use the default SGD optimizer with the Adam update rule and a learning rate of 5e-4. Note that `batch_size = 50` denotes generating 50 sequences each time.
 
 ### Model Structure
+
 1. Define some global variables
 
    ```python
@@ -493,6 +494,7 @@ This tutorial will use the default SGD and Adam learn
        with mixed_layer(size=decoder_size) as encoded_proj:
            encoded_proj += full_matrix_projection(input=encoded_vector)
    ```
+
   3.2 Use a non-linear transformation of the last hidden state of the backward GRU on the source-language sentence as the initial state of the decoder RNN $c_0=h_T$
 
    ```python
@@ -502,6 +504,7 @@ This tutorial will use the default SGD and Adam learn
            act=TanhActivation(), ) as decoder_boot:
        decoder_boot += full_matrix_projection(input=backward_first)
    ```
+
   3.3 Define the computation in each time step for the decoder RNN, i.e., according to the current context vector $c_i$, the hidden state of the decoder $z_i$, and the $i$-th word $u_i$ in the target language, predict the probability $p_{i+1}$ of the $(i+1)$-th word.
 
      - decoder_mem records the hidden state $z_i$ from the previous time step, with its initial state set to decoder_boot.
@@ -536,6 +539,7 @@ This tutorial will use the default SGD and Adam learn
        out += full_matrix_projection(input=gru_step)
    return out
    ```
+
 4. Decoder differences between training and generation
 
   4.1 Define the name of the decoder and the first two inputs to `gru_decoder_with_attention`. Note that `StaticInput` is used for these two inputs. Please refer to the [StaticInput Document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for more details.
@@ -546,6 +550,7 @@ This tutorial will use the default SGD and Adam learn
    group_input2 = StaticInput(input=encoded_proj, is_seq=True)
    group_inputs = [group_input1, group_input2]
    ```
+
   4.2 In training mode:
 
      - The word embedding of the target language, trg_embedding, is passed to `gru_decoder_with_attention` as current_word.
@@ -571,6 +576,7 @@ This tutorial will use the default SGD and Adam learn
    cost = classification_cost(input=decoder, label=lbl)
    outputs(cost)
    ```
+
   4.3 In generation mode:
 
      - During generation, since the decoder RNN takes the word generated at the previous time step as input, `GeneratedInput` is used to implement this automatically. Please refer to the [GeneratedInput Document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for details.
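The README.en.md hunks above change only blank lines and fence indentation, so the decoder step function of section 3.3 surfaces only as scattered context fragments. For orientation, the sketch below reconstructs that `gru_decoder_with_attention` step in the old `trainer_config_helpers` style the tutorial targets; the two size globals are stand-in values for the tutorial's section 1 settings, and `decoder_boot` is assumed to be the boot layer built in the section 3.2 snippet, so treat this as a reading aid rather than a verbatim copy of the file.

```python
# Sketch only: reconstructed from the hunk fragments above, in the
# old trainer_config_helpers config API this tutorial is written for.
from paddle.trainer_config_helpers import *

decoder_size = 512       # assumption: stand-in for the section 1 global
target_dict_dim = 30000  # assumption: target-language dictionary size
# Assumes decoder_boot (built in the section 3.2 snippet) is in scope.

def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
    # decoder_mem records z_i from the previous time step; decoder_boot
    # (the transformed last backward-GRU state) is its initial state.
    decoder_mem = memory(
        name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)

    # Attention: combine the encoder outputs with the current decoder
    # state to produce the context vector c_i.
    context = simple_attention(
        encoded_sequence=enc_vec,
        encoded_proj=enc_proj,
        decoder_state=decoder_mem)

    # Project the context and the current word into the GRU input.
    with mixed_layer(size=decoder_size * 3) as decoder_inputs:
        decoder_inputs += full_matrix_projection(input=context)
        decoder_inputs += full_matrix_projection(input=current_word)

    # One GRU step; output_mem ties this step to decoder_mem.
    gru_step = gru_step_layer(
        name='gru_decoder',
        input=decoder_inputs,
        output_mem=decoder_mem,
        size=decoder_size)

    # Softmax over the target dictionary; this is the tail visible in
    # the hunk context ("out += full_matrix_projection(input=gru_step)").
    with mixed_layer(
            size=target_dict_dim, bias_attr=True,
            act=SoftmaxActivation()) as out:
        out += full_matrix_projection(input=gru_step)
    return out
```

The shared `name='gru_decoder'` between `memory` and `gru_step_layer` carries state across time steps, and it is what lets the same step function be driven by `recurrent_group` in training and by `beam_search` in generation, which is exactly the 4.x distinction these hunks reformat.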
diff --git a/machine_translation/README.md b/machine_translation/README.md
index 3eec2b6..d1902a2 100644
--- a/machine_translation/README.md
+++ b/machine_translation/README.md
@@ -340,6 +340,7 @@ wmt14_reader = paddle.batch(
         out += paddle.layer.full_matrix_projection(input=gru_step)
     return out
     ```
+
 4. Differences in how the decoder is called in training mode versus generation mode.
 
    4.1 Define the decoder frame name and the first two inputs to the `gru_decoder_with_attention` function. Note: both inputs use `StaticInput`; see the [StaticInput documentation](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for details.
@@ -400,6 +401,7 @@ for param in parameters.keys():
 ```
 
 ### Train the Model
+
 1. Construct the trainer
 
     The trainer is built from the optimization target cost, the network topology, and the model parameters; an optimization method must also be specified at construction time, and the most basic SGD method is used here.
@@ -409,7 +411,7 @@ for param in parameters.keys():
     trainer = paddle.trainer.SGD(cost=cost,
                                  parameters=parameters,
                                  update_equation=optimizer)
-```
+    ```
 
 2. Construct the event_handler
 
@@ -421,6 +423,7 @@ for param in parameters.keys():
             print "Pass %d, Batch %d, Cost %f, %s" % (
                 event.pass_id, event.batch_id, event.cost, event.metrics)
     ```
+
 3. Start training:
 
     ```python
@@ -435,7 +438,7 @@ for param in parameters.keys():
     Pass 0, Batch 0, Cost 247.408008, {'classification_error_evaluator': 1.0}
     Pass 0, Batch 10, Cost 212.058789, {'classification_error_evaluator': 0.8737863898277283}
     ...
-```
+    ```
 
 ## Apply the Model
 
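The README.md hunks above likewise slice the training walkthrough into fragments. Pieced together, the flow they document looks roughly like the sketch below; `cost`, `parameters`, `wmt14_reader`, and `feeding` come from earlier sections of the tutorial and are assumed to be in scope, the 5e-4 learning rate is taken from the English page rather than a visible context line, and the `trainer.train` arguments follow the tutorial's usual invocation rather than anything shown in the hunks (the tutorial targets Python 2, hence the print statement).

```python
# Sketch: the training flow these hunks document, assembled in one place.
# Assumes cost, parameters, wmt14_reader and feeding were built as in the
# earlier sections of this tutorial.
import paddle.v2 as paddle

# 1. Construct the trainer: basic SGD whose per-parameter update rule
#    is Adam (learning rate per the English page; assumption here).
optimizer = paddle.optimizer.Adam(learning_rate=5e-4)
trainer = paddle.trainer.SGD(
    cost=cost, parameters=parameters, update_equation=optimizer)

# 2. Construct the event_handler: log cost and metrics every 10 batches.
def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
        if event.batch_id % 10 == 0:
            print "Pass %d, Batch %d, Cost %f, %s" % (
                event.pass_id, event.batch_id, event.cost, event.metrics)

# 3. Start training; argument values are assumptions from the tutorial's
#    usual invocation, not lines visible in the patch.
trainer.train(
    reader=wmt14_reader,
    event_handler=event_handler,
    num_passes=2,
    feeding=feeding)
```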
diff --git a/machine_translation/index.en.html b/machine_translation/index.en.html
index 06da2eb..bafae0e 100644
--- a/machine_translation/index.en.html
+++ b/machine_translation/index.en.html
@@ -488,6 +488,7 @@ settings(
 This tutorial will use the default SGD optimizer with the Adam update rule and a learning rate of 5e-4. Note that `batch_size = 50` denotes generating 50 sequences each time.
 
 ### Model Structure
+
 1. Define some global variables
 
    ```python
@@ -535,6 +536,7 @@ This tutorial will use the default SGD and Adam learn
        with mixed_layer(size=decoder_size) as encoded_proj:
            encoded_proj += full_matrix_projection(input=encoded_vector)
    ```
+
   3.2 Use a non-linear transformation of the last hidden state of the backward GRU on the source-language sentence as the initial state of the decoder RNN $c_0=h_T$
 
    ```python
@@ -544,6 +546,7 @@ This tutorial will use the default SGD and Adam learn
            act=TanhActivation(), ) as decoder_boot:
        decoder_boot += full_matrix_projection(input=backward_first)
    ```
+
   3.3 Define the computation in each time step for the decoder RNN, i.e., according to the current context vector $c_i$, the hidden state of the decoder $z_i$, and the $i$-th word $u_i$ in the target language, predict the probability $p_{i+1}$ of the $(i+1)$-th word.
 
      - decoder_mem records the hidden state $z_i$ from the previous time step, with its initial state set to decoder_boot.
@@ -578,6 +581,7 @@ This tutorial will use the default SGD and Adam learn
        out += full_matrix_projection(input=gru_step)
    return out
    ```
+
 4. Decoder differences between training and generation
 
   4.1 Define the name of the decoder and the first two inputs to `gru_decoder_with_attention`. Note that `StaticInput` is used for these two inputs. Please refer to the [StaticInput Document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for more details.
@@ -588,6 +592,7 @@ This tutorial will use the default SGD and Adam learn
    group_input2 = StaticInput(input=encoded_proj, is_seq=True)
    group_inputs = [group_input1, group_input2]
    ```
+
   4.2 In training mode:
 
      - The word embedding of the target language, trg_embedding, is passed to `gru_decoder_with_attention` as current_word.
@@ -613,6 +618,7 @@ This tutorial will use the default SGD and Adam learn
    cost = classification_cost(input=decoder, label=lbl)
    outputs(cost)
    ```
+
   4.3 In generation mode:
 
      - During generation, since the decoder RNN takes the word generated at the previous time step as input, `GeneratedInput` is used to implement this automatically. Please refer to the [GeneratedInput Document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for details.
diff --git a/machine_translation/index.html b/machine_translation/index.html
index 78a7ea9..a960a2c 100644
--- a/machine_translation/index.html
+++ b/machine_translation/index.html
@@ -382,6 +382,7 @@ wmt14_reader = paddle.batch(
         out += paddle.layer.full_matrix_projection(input=gru_step)
     return out
     ```
+
 4. Differences in how the decoder is called in training mode versus generation mode.
 
    4.1 Define the decoder frame name and the first two inputs to the `gru_decoder_with_attention` function. Note: both inputs use `StaticInput`; see the [StaticInput documentation](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for details.
@@ -442,6 +443,7 @@ for param in parameters.keys():
 ```
 
 ### Train the Model
+
 1. Construct the trainer
 
     The trainer is built from the optimization target cost, the network topology, and the model parameters; an optimization method must also be specified at construction time, and the most basic SGD method is used here.
@@ -451,7 +453,7 @@ for param in parameters.keys():
     trainer = paddle.trainer.SGD(cost=cost,
                                  parameters=parameters,
                                  update_equation=optimizer)
-```
+    ```
 
 2. Construct the event_handler
 
@@ -463,6 +465,7 @@ for param in parameters.keys():
             print "Pass %d, Batch %d, Cost %f, %s" % (
                 event.pass_id, event.batch_id, event.cost, event.metrics)
     ```
+
 3. Start training:
 
     ```python
@@ -477,7 +480,7 @@ for param in parameters.keys():
     Pass 0, Batch 0, Cost 247.408008, {'classification_error_evaluator': 1.0}
     Pass 0, Batch 10, Cost 212.058789, {'classification_error_evaluator': 0.8737863898277283}
     ...
-```
+    ```
 
 ## Apply the Model
-- 
GitLab
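The 4.3 hunks in both English pages introduce the generation branch, but none of its code appears in the visible context lines. As a reference sketch only: generation in this API typically swaps the target-word input for a `GeneratedInput` and drives the same step function with `beam_search`. Here `beam_size`, `max_length`, the shared embedding name, the begin/end token ids, and the carried-over identifiers (`group_inputs`, `decoder_group_name`, `target_dict_dim`, `word_vector_dim`, `gru_decoder_with_attention`) are assumptions, not lines from the patch.

```python
# Sketch only: the generation-mode wiring that section 4.3 describes.
# Mirrors the training setup except that the current word comes from
# GeneratedInput instead of the target-language labels. Assumes the
# section 4.1 names (group_inputs, decoder_group_name, ...) are in scope.
from paddle.trainer_config_helpers import *

beam_size = 3     # assumption: a typical beam width for this tutorial
max_length = 250  # assumption: cap on generated sequence length

# GeneratedInput feeds the embedding of the word predicted at step i-1
# back in as current_word at step i; embedding_name shares the embedding
# table with training mode.
trg_embedding = GeneratedInput(
    size=target_dict_dim,
    embedding_name='_target_language_embedding',
    embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)

# beam_search drives gru_decoder_with_attention step by step, keeping
# the beam_size best partial translations until the end token (eos_id)
# or max_length; assumes <s> has id 0 and <e> has id 1.
beam_gen = beam_search(
    name=decoder_group_name,
    step=gru_decoder_with_attention,
    input=group_inputs,
    bos_id=0,
    eos_id=1,
    beam_size=beam_size,
    max_length=max_length)

outputs(beam_gen)
```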