Merge pull request #339 from lcy-seso/refine_wmt

Refine Chapter 8 WMT.

Merge pull request #339 from lcy-seso/refine_wmt
Refine Chapter 8 WMT.
a5a71eab · Cao Ying · GitHub · 77fa0921 · 63265281 · a5a71eab
5 changed files
--- a/08.machine_translation/README.cn.md
+++ b/08.machine_translation/README.cn.md
@@ -185,16 +185,16 @@ is_generating = False
 ### 模型结构
 1. 首先，定义了一些全局变量。

-   ```python
-   dict_size = 30000 # 字典维度
-   source_dict_dim = dict_size # 源语言字典维度
-   target_dict_dim = dict_size # 目标语言字典维度
-   word_vector_dim = 512 # 词向量维度
-   encoder_size = 512 # 编码器中的GRU隐层大小
-   decoder_size = 512 # 解码器中的GRU隐层大小
-   beam_size = 3 # 柱宽度
-   max_length = 250 # 生成句子的最大长度
-  ```
+    ```python
+    dict_size = 30000 # 字典维度
+    source_dict_dim = dict_size # 源语言字典维度
+    target_dict_dim = dict_size # 目标语言字典维度
+    word_vector_dim = 512 # 词向量维度
+    encoder_size = 512 # 编码器中的GRU隐层大小
+    decoder_size = 512 # 解码器中的GRU隐层大小
+    beam_size = 3 # 柱宽度
+    max_length = 250 # 生成句子的最大长度
+    ```

 2. 其次，实现编码器框架。分为三步：

@@ -209,9 +209,7 @@ is_generating = False

   ```python
    src_embedding = paddle.layer.embedding(
-        input=src_word_id,
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
+        input=src_word_id, size=word_vector_dim)
   ```
   - 用双向GRU编码源语言序列，拼接两个GRU的编码结果得到$\mathbf{h}$。

@@ -228,19 +226,22 @@ is_generating = False
   - 对源语言序列编码后的结果（见2的最后一步），过一个前馈神经网络（Feed Forward Neural Network），得到其映射。

   ```python
-   encoded_proj = paddle.layer.mixed(
-       size=decoder_size,
-       input=paddle.layer.full_matrix_projection(encoded_vector))
+   encoded_proj = paddle.layer.fc(
+         act=paddle.activation.Linear(),
+         size=decoder_size,
+         bias_attr=False,
+         input=encoded_vector)
   ```

   - 构造解码器RNN的初始状态。由于解码器需要预测时序目标序列，但在0时刻并没有初始值，所以我们希望对其进行初始化。这里采用的是将源语言序列逆序编码后的最后一个状态进行非线性映射，作为该初始值，即$c_0=h_T$。

   ```python
   backward_first = paddle.layer.first_seq(input=src_backward)
-   decoder_boot = paddle.layer.mixed(
-       size=decoder_size,
-       act=paddle.activation.Tanh(),
-       input=paddle.layer.full_matrix_projection(backward_first))
+   decoder_boot = paddle.layer.fc(
+         size=decoder_size,
+         act=paddle.activation.Tanh(),
+         bias_attr=False,
+         input=backward_first)
   ```

   - 定义解码阶段每一个时间步的RNN行为，即根据当前时刻的源语言上下文向量$c_i$、解码器隐层状态$z_i$和目标语言中第$i$个词$u_i$，来预测第$i+1$个词的概率$p_{i+1}$。
@@ -260,12 +261,13 @@ is_generating = False
            encoded_proj=enc_proj,
            decoder_state=decoder_mem)

-        decoder_inputs = paddle.layer.mixed(
+        decoder_inputs = paddle.layer.fc(
+            act=paddle.activation.Linear(),
            size=decoder_size * 3,
-            input=[
-                paddle.layer.full_matrix_projection(input=context),
-                paddle.layer.full_matrix_projection(input=current_word)
-            ])
+            bias_attr=False,
+            input=[context, current_word],
+            layer_attr=paddle.attr.ExtraLayerAttribute(
+                error_clipping_threshold=100.0))

        gru_step = paddle.layer.gru_step(
            name='gru_decoder',
@@ -285,8 +287,8 @@ is_generating = False

    ```python
    decoder_group_name = "decoder_group"
-    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj)
    group_inputs = [group_input1, group_input2]
    ```

@@ -301,7 +303,7 @@ is_generating = False
   if not is_generating:
       trg_embedding = paddle.layer.embedding(
           input=paddle.layer.data(
-               name='target_language_word',  
+               name='target_language_word',
               type=paddle.data_type.integer_value_sequence(target_dict_dim)),
           size=word_vector_dim,
           param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
@@ -330,14 +332,13 @@ is_generating = False

   ```python
   if is_generating:
-       # In generation, the decoder predicts a next target word based on
-       # the encoded source sequence and the last generated target word.
+      # In generation, the decoder predicts a next target word based on
+      # the encoded source sequence and the previous generated target word.

-       # The encoded source sequence (encoder's output) must be specified by
-       # StaticInput, which is a read-only memory.
-       # Embedding of the last generated word is automatically gotten by
-       # GeneratedInputs, which is initialized by a start mark, such as <s>,
-       # and must be included in generation.
+      # The encoded source sequence (encoder's output) must be specified by
+      # StaticInput, which is a read-only memory.
+      # Embedding of the previous generated word is automatically retrieved
+      # by GeneratedInputs initialized by a start mark <s>.

       trg_embedding = paddle.layer.GeneratedInput(
           size=target_dict_dim,
@@ -468,36 +469,31 @@ is_generating = False

    ```python
    if is_generating:
-        # get the dictionary
+        # load the dictionary
        src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)

-        # the delimited element of generated sequences is -1,
-        # the first element of each generated sequence is the sequence length
-        seq_list = []
-        seq = []
-        for w in beam_result[1]:
-            if w != -1:
-                seq.append(w)
-            else:
-                seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
-                seq = []
-
-        prob = beam_result[0]
-        for i in xrange(gen_num):
-            print "\n*******************************************************\n"
-            print "src:", ' '.join(
-                [src_dict.get(w) for w in gen_data[i][0]]), "\n"
+        gen_sen_idx = np.where(beam_result[1] == -1)[0]
+        assert len(gen_sen_idx) == len(gen_data) * beam_size
+
+        # -1 is the delimiter of generated sequences.
+        # the first element of each generated sequence is its length.
+        start_pos, end_pos = 1, 0
+        for i, sample in enumerate(gen_data):
+            print(" ".join([src_dict[w] for w in sample[0][1:-1]]))
            for j in xrange(beam_size):
-                print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
+                end_pos = gen_sen_idx[i * beam_size + j]
+                print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
+                    trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
+                start_pos = end_pos + 2
+            print("\n")
    ```

  生成开始后，可以观察到输出的日志如下：
  ```text
-  src: <s> Les <unk> se <unk> au sujet de la largeur des sièges alors que de grosses commandes sont en jeu <e>
-
-  prob = -19.019573: The <unk> will be rotated about the width of the seats , while large orders are at stake . <e>
-  prob = -19.113066: The <unk> will be rotated about the width of the seats , while large commands are at stake . <e>
-  prob = -19.512890: The <unk> will be rotated about the width of the seats , while large commands are at play . <e>
+  Les <unk> se <unk> au sujet de la largeur des sièges alors que de grosses commandes sont en jeu
+  -19.0196        The <unk> will be rotated about the width of the seats , while large orders are at stake . <e>
+  -19.1131        The <unk> will be rotated about the width of the seats , while large commands are at stake . <e>
+  -19.5129        The <unk> will be rotated about the width of the seats , while large commands are at play . <e>
  ```

 ## 总结

--- a/08.machine_translation/README.md
+++ b/08.machine_translation/README.md
@@ -230,34 +230,32 @@ is_generating = False
   decoder_size = 512 # hidden layer size of GRU in decoder
   beam_size = 3 # expand width in beam search
   max_length = 250 # a stop condition of sequence generation
-  ```
+   ```

 2. Implement Encoder as follows:
   - Input is a sequence of words represented by an integer word index sequence. So we define data layer of data type `integer_value_sequence`. The value range of each element in the sequence is `[0, source_dict_dim)`

   ```python
-    src_word_id = paddle.layer.data(
-        name='source_language_word',
-        type=paddle.data_type.integer_value_sequence(source_dict_dim))
+   src_word_id = paddle.layer.data(
+       name='source_language_word',
+       type=paddle.data_type.integer_value_sequence(source_dict_dim))
   ```

   - Map the one-hot vector (represented by word index) into a word vector $\mathbf{s}$ in a low-dimensional semantic space

   ```python
-    src_embedding = paddle.layer.embedding(
-        input=src_word_id,
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
+   src_embedding = paddle.layer.embedding(
+       input=src_word_id, size=word_vector_dim)
   ```

   - Use bi-directional GRU to encode the source language sequence, and concatenate the encoding outputs from the two GRUs to get $\mathbf{h}$

   ```python
-    src_forward = paddle.networks.simple_gru(
-        input=src_embedding, size=encoder_size)
-    src_backward = paddle.networks.simple_gru(
-        input=src_embedding, size=encoder_size, reverse=True)
-    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
+   src_forward = paddle.networks.simple_gru(
+       input=src_embedding, size=encoder_size)
+   src_backward = paddle.networks.simple_gru(
+       input=src_embedding, size=encoder_size, reverse=True)
+   encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
   ```

 3. Implement Attention-based Decoder as follows:
@@ -265,19 +263,22 @@ is_generating = False
   - Get a projection of the encoding (c.f. 2.3) of the source language sequence by passing it into a feed forward neural network

   ```python
-   encoded_proj = paddle.layer.mixed(
-       size=decoder_size,
-       input=paddle.layer.full_matrix_projection(encoded_vector))
+   encoded_proj = paddle.layer.fc(
+         act=paddle.activation.Linear(),
+         size=decoder_size,
+         bias_attr=False,
+         input=encoded_vector)
   ```

   - Use a non-linear transformation of the last hidden state of the backward GRU on the source language sentence as the initial state of the decoder RNN $c_0=h_T$

   ```python
   backward_first = paddle.layer.first_seq(input=src_backward)
-   decoder_boot = paddle.layer.mixed(
-       size=decoder_size,
-       act=paddle.activation.Tanh(),
-       input=paddle.layer.full_matrix_projection(backward_first))
+   decoder_boot = paddle.layer.fc(
+         size=decoder_size,
+         act=paddle.activation.Tanh(),
+         bias_attr=False,
+         input=backward_first)
   ```

   - Define the computation in each time step for the decoder RNN, i.e., according to the current context vector $c_i$, hidden state for the decoder $z_i$ and the $i$-th word $u_i$ in the target language to predict the probability $p_{i+1}$ for the $i+1$-th word.
@@ -298,12 +299,13 @@ is_generating = False
            encoded_proj=enc_proj,
            decoder_state=decoder_mem)

-        decoder_inputs = paddle.layer.mixed(
+        decoder_inputs = paddle.layer.fc(
+            act=paddle.activation.Linear(),
            size=decoder_size * 3,
-            input=[
-                paddle.layer.full_matrix_projection(input=context),
-                paddle.layer.full_matrix_projection(input=current_word)
-            ])
+            bias_attr=False,
+            input=[context, current_word],
+            layer_attr=paddle.attr.ExtraLayerAttribute(
+            error_clipping_threshold=100.0))

        gru_step = paddle.layer.gru_step(
            name='gru_decoder',
@@ -311,11 +313,11 @@ is_generating = False
            output_mem=decoder_mem,
            size=decoder_size)

-        out = paddle.layer.mixed(
+        out = paddle.layer.fc(
            size=target_dict_dim,
            bias_attr=True,
            act=paddle.activation.Softmax(),
-            input=paddle.layer.full_matrix_projection(input=gru_step))
+            input=gru_step)
        return out
   ```

@@ -323,8 +325,8 @@ is_generating = False

    ```python
    decoder_group_name = "decoder_group"
-    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj)
    group_inputs = [group_input1, group_input2]
    ```

@@ -369,13 +371,12 @@ is_generating = False
   ```python
   if is_generating:
       # In generation, the decoder predicts a next target word based on
-       # the encoded source sequence and the last generated target word.
+       # the encoded source sequence and the previous generated target word.

       # The encoded source sequence (encoder's output) must be specified by
       # StaticInput, which is a read-only memory.
-       # Embedding of the last generated word is automatically gotten by
-       # GeneratedInputs, which is initialized by a start mark, such as <s>,
-       # and must be included in generation.
+       # Embedding of the previous generated word is automatically retrieved
+       # by GeneratedInputs initialized by a start mark <s>.

       trg_embedding = paddle.layer.GeneratedInput(
           size=target_dict_dim,
@@ -504,36 +505,31 @@ Note: Our configuration is based on Bahdanau et al. \[[4](#Reference)\] but with

   ```python
   if is_generating:
-        # get the dictionary
-        src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
-
-        # the delimited element of generated sequences is -1,
-        # the first element of each generated sequence is the sequence length
-        seq_list = []
-        seq = []
-        for w in beam_result[1]:
-            if w != -1:
-                seq.append(w)
-            else:
-                seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
-                seq = []
-
-        prob = beam_result[0]
-        for i in xrange(gen_num):
-            print "\n*******************************************************\n"
-            print "src:", ' '.join(
-                [src_dict.get(w) for w in gen_data[i][0]]), "\n"
-            for j in xrange(beam_size):
-                print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
+       # load the dictionary
+       src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+
+       gen_sen_idx = np.where(beam_result[1] == -1)[0]
+       assert len(gen_sen_idx) == len(gen_data) * beam_size
+
+       # -1 is the delimiter of generated sequences.
+       # the first element of each generated sequence is its length.
+       start_pos, end_pos = 1, 0
+       for i, sample in enumerate(gen_data):
+           print(" ".join([src_dict[w] for w in sample[0][1:-1]]))
+           for j in xrange(beam_size):
+               end_pos = gen_sen_idx[i * beam_size + j]
+               print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
+                     trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
+               start_pos = end_pos + 2
+           print("\n")
   ```

  The generating log is as follows:
  ```text
-  src: <s> Les <unk> se <unk> au sujet de la largeur des sièges alors que de grosses commandes sont en jeu <e>
-
-  prob = -19.019573: The <unk> will be rotated about the width of the seats , while large orders are at stake . <e>
-  prob = -19.113066: The <unk> will be rotated about the width of the seats , while large commands are at stake . <e>
-  prob = -19.512890: The <unk> will be rotated about the width of the seats , while large commands are at play . <e>
+  Les <unk> se <unk> au sujet de la largeur des sièges alors que de grosses commandes sont en jeu
+  -19.0196        The <unk> will be rotated about the width of the seats , while large orders are at stake . <e>
+  -19.1131        The <unk> will be rotated about the width of the seats , while large commands are at stake . <e>
+  -19.5129        The <unk> will be rotated about the width of the seats , while large commands are at play . <e>
  ```

 ## Summary

--- a/08.machine_translation/index.cn.html
+++ b/08.machine_translation/index.cn.html
@@ -227,16 +227,16 @@ is_generating = False
 ### 模型结构
 1. 首先，定义了一些全局变量。

-   ```python
-   dict_size = 30000 # 字典维度
-   source_dict_dim = dict_size # 源语言字典维度
-   target_dict_dim = dict_size # 目标语言字典维度
-   word_vector_dim = 512 # 词向量维度
-   encoder_size = 512 # 编码器中的GRU隐层大小
-   decoder_size = 512 # 解码器中的GRU隐层大小
-   beam_size = 3 # 柱宽度
-   max_length = 250 # 生成句子的最大长度
-  ```
+    ```python
+    dict_size = 30000 # 字典维度
+    source_dict_dim = dict_size # 源语言字典维度
+    target_dict_dim = dict_size # 目标语言字典维度
+    word_vector_dim = 512 # 词向量维度
+    encoder_size = 512 # 编码器中的GRU隐层大小
+    decoder_size = 512 # 解码器中的GRU隐层大小
+    beam_size = 3 # 柱宽度
+    max_length = 250 # 生成句子的最大长度
+    ```

 2. 其次，实现编码器框架。分为三步：

@@ -251,9 +251,7 @@ is_generating = False

   ```python
    src_embedding = paddle.layer.embedding(
-        input=src_word_id,
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
+        input=src_word_id, size=word_vector_dim)
   ```
   - 用双向GRU编码源语言序列，拼接两个GRU的编码结果得到$\mathbf{h}$。

@@ -270,19 +268,22 @@ is_generating = False
   - 对源语言序列编码后的结果（见2的最后一步），过一个前馈神经网络（Feed Forward Neural Network），得到其映射。

   ```python
-   encoded_proj = paddle.layer.mixed(
-       size=decoder_size,
-       input=paddle.layer.full_matrix_projection(encoded_vector))
+   encoded_proj = paddle.layer.fc(
+         act=paddle.activation.Linear(),
+         size=decoder_size,
+         bias_attr=False,
+         input=encoded_vector)
   ```

   - 构造解码器RNN的初始状态。由于解码器需要预测时序目标序列，但在0时刻并没有初始值，所以我们希望对其进行初始化。这里采用的是将源语言序列逆序编码后的最后一个状态进行非线性映射，作为该初始值，即$c_0=h_T$。

   ```python
   backward_first = paddle.layer.first_seq(input=src_backward)
-   decoder_boot = paddle.layer.mixed(
-       size=decoder_size,
-       act=paddle.activation.Tanh(),
-       input=paddle.layer.full_matrix_projection(backward_first))
+   decoder_boot = paddle.layer.fc(
+         size=decoder_size,
+         act=paddle.activation.Tanh(),
+         bias_attr=False,
+         input=backward_first)
   ```

   - 定义解码阶段每一个时间步的RNN行为，即根据当前时刻的源语言上下文向量$c_i$、解码器隐层状态$z_i$和目标语言中第$i$个词$u_i$，来预测第$i+1$个词的概率$p_{i+1}$。
@@ -302,12 +303,13 @@ is_generating = False
            encoded_proj=enc_proj,
            decoder_state=decoder_mem)

-        decoder_inputs = paddle.layer.mixed(
+        decoder_inputs = paddle.layer.fc(
+            act=paddle.activation.Linear(),
            size=decoder_size * 3,
-            input=[
-                paddle.layer.full_matrix_projection(input=context),
-                paddle.layer.full_matrix_projection(input=current_word)
-            ])
+            bias_attr=False,
+            input=[context, current_word],
+            layer_attr=paddle.attr.ExtraLayerAttribute(
+                error_clipping_threshold=100.0))

        gru_step = paddle.layer.gru_step(
            name='gru_decoder',
@@ -327,8 +329,8 @@ is_generating = False

    ```python
    decoder_group_name = "decoder_group"
-    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj)
    group_inputs = [group_input1, group_input2]
    ```

@@ -343,7 +345,7 @@ is_generating = False
   if not is_generating:
       trg_embedding = paddle.layer.embedding(
           input=paddle.layer.data(
-               name='target_language_word',  
+               name='target_language_word',
               type=paddle.data_type.integer_value_sequence(target_dict_dim)),
           size=word_vector_dim,
           param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
@@ -372,14 +374,13 @@ is_generating = False

   ```python
   if is_generating:
-       # In generation, the decoder predicts a next target word based on
-       # the encoded source sequence and the last generated target word.
+      # In generation, the decoder predicts a next target word based on
+      # the encoded source sequence and the previous generated target word.

-       # The encoded source sequence (encoder's output) must be specified by
-       # StaticInput, which is a read-only memory.
-       # Embedding of the last generated word is automatically gotten by
-       # GeneratedInputs, which is initialized by a start mark, such as <s>,
-       # and must be included in generation.
+      # The encoded source sequence (encoder's output) must be specified by
+      # StaticInput, which is a read-only memory.
+      # Embedding of the previous generated word is automatically retrieved
+      # by GeneratedInputs initialized by a start mark <s>.

       trg_embedding = paddle.layer.GeneratedInput(
           size=target_dict_dim,
@@ -510,36 +511,31 @@ is_generating = False

    ```python
    if is_generating:
-        # get the dictionary
+        # load the dictionary
        src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)

-        # the delimited element of generated sequences is -1,
-        # the first element of each generated sequence is the sequence length
-        seq_list = []
-        seq = []
-        for w in beam_result[1]:
-            if w != -1:
-                seq.append(w)
-            else:
-                seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
-                seq = []
-
-        prob = beam_result[0]
-        for i in xrange(gen_num):
-            print "\n*******************************************************\n"
-            print "src:", ' '.join(
-                [src_dict.get(w) for w in gen_data[i][0]]), "\n"
+        gen_sen_idx = np.where(beam_result[1] == -1)[0]
+        assert len(gen_sen_idx) == len(gen_data) * beam_size
+
+        # -1 is the delimiter of generated sequences.
+        # the first element of each generated sequence is its length.
+        start_pos, end_pos = 1, 0
+        for i, sample in enumerate(gen_data):
+            print(" ".join([src_dict[w] for w in sample[0][1:-1]]))
            for j in xrange(beam_size):
-                print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
+                end_pos = gen_sen_idx[i * beam_size + j]
+                print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
+                    trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
+                start_pos = end_pos + 2
+            print("\n")
    ```

  生成开始后，可以观察到输出的日志如下：
  ```text
-  src: <s> Les <unk> se <unk> au sujet de la largeur des sièges alors que de grosses commandes sont en jeu <e>
-
-  prob = -19.019573: The <unk> will be rotated about the width of the seats , while large orders are at stake . <e>
-  prob = -19.113066: The <unk> will be rotated about the width of the seats , while large commands are at stake . <e>
-  prob = -19.512890: The <unk> will be rotated about the width of the seats , while large commands are at play . <e>
+  Les <unk> se <unk> au sujet de la largeur des sièges alors que de grosses commandes sont en jeu
+  -19.0196        The <unk> will be rotated about the width of the seats , while large orders are at stake . <e>
+  -19.1131        The <unk> will be rotated about the width of the seats , while large commands are at stake . <e>
+  -19.5129        The <unk> will be rotated about the width of the seats , while large commands are at play . <e>
  ```

 ## 总结

--- a/08.machine_translation/index.html
+++ b/08.machine_translation/index.html
@@ -272,34 +272,32 @@ is_generating = False
   decoder_size = 512 # hidden layer size of GRU in decoder
   beam_size = 3 # expand width in beam search
   max_length = 250 # a stop condition of sequence generation
-  ```
+   ```

 2. Implement Encoder as follows:
   - Input is a sequence of words represented by an integer word index sequence. So we define data layer of data type `integer_value_sequence`. The value range of each element in the sequence is `[0, source_dict_dim)`

   ```python
-    src_word_id = paddle.layer.data(
-        name='source_language_word',
-        type=paddle.data_type.integer_value_sequence(source_dict_dim))
+   src_word_id = paddle.layer.data(
+       name='source_language_word',
+       type=paddle.data_type.integer_value_sequence(source_dict_dim))
   ```

   - Map the one-hot vector (represented by word index) into a word vector $\mathbf{s}$ in a low-dimensional semantic space

   ```python
-    src_embedding = paddle.layer.embedding(
-        input=src_word_id,
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
+   src_embedding = paddle.layer.embedding(
+       input=src_word_id, size=word_vector_dim)
   ```

   - Use bi-directional GRU to encode the source language sequence, and concatenate the encoding outputs from the two GRUs to get $\mathbf{h}$

   ```python
-    src_forward = paddle.networks.simple_gru(
-        input=src_embedding, size=encoder_size)
-    src_backward = paddle.networks.simple_gru(
-        input=src_embedding, size=encoder_size, reverse=True)
-    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
+   src_forward = paddle.networks.simple_gru(
+       input=src_embedding, size=encoder_size)
+   src_backward = paddle.networks.simple_gru(
+       input=src_embedding, size=encoder_size, reverse=True)
+   encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
   ```

 3. Implement Attention-based Decoder as follows:
@@ -307,19 +305,22 @@ is_generating = False
   - Get a projection of the encoding (c.f. 2.3) of the source language sequence by passing it into a feed forward neural network

   ```python
-   encoded_proj = paddle.layer.mixed(
-       size=decoder_size,
-       input=paddle.layer.full_matrix_projection(encoded_vector))
+   encoded_proj = paddle.layer.fc(
+         act=paddle.activation.Linear(),
+         size=decoder_size,
+         bias_attr=False,
+         input=encoded_vector)
   ```

   - Use a non-linear transformation of the last hidden state of the backward GRU on the source language sentence as the initial state of the decoder RNN $c_0=h_T$

   ```python
   backward_first = paddle.layer.first_seq(input=src_backward)
-   decoder_boot = paddle.layer.mixed(
-       size=decoder_size,
-       act=paddle.activation.Tanh(),
-       input=paddle.layer.full_matrix_projection(backward_first))
+   decoder_boot = paddle.layer.fc(
+         size=decoder_size,
+         act=paddle.activation.Tanh(),
+         bias_attr=False,
+         input=backward_first)
   ```

   - Define the computation in each time step for the decoder RNN, i.e., according to the current context vector $c_i$, hidden state for the decoder $z_i$ and the $i$-th word $u_i$ in the target language to predict the probability $p_{i+1}$ for the $i+1$-th word.
@@ -340,12 +341,13 @@ is_generating = False
            encoded_proj=enc_proj,
            decoder_state=decoder_mem)

-        decoder_inputs = paddle.layer.mixed(
+        decoder_inputs = paddle.layer.fc(
+            act=paddle.activation.Linear(),
            size=decoder_size * 3,
-            input=[
-                paddle.layer.full_matrix_projection(input=context),
-                paddle.layer.full_matrix_projection(input=current_word)
-            ])
+            bias_attr=False,
+            input=[context, current_word],
+            layer_attr=paddle.attr.ExtraLayerAttribute(
+            error_clipping_threshold=100.0))

        gru_step = paddle.layer.gru_step(
            name='gru_decoder',
@@ -353,11 +355,11 @@ is_generating = False
            output_mem=decoder_mem,
            size=decoder_size)

-        out = paddle.layer.mixed(
+        out = paddle.layer.fc(
            size=target_dict_dim,
            bias_attr=True,
            act=paddle.activation.Softmax(),
-            input=paddle.layer.full_matrix_projection(input=gru_step))
+            input=gru_step)
        return out
   ```

@@ -365,8 +367,8 @@ is_generating = False

    ```python
    decoder_group_name = "decoder_group"
-    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj)
    group_inputs = [group_input1, group_input2]
    ```

@@ -411,13 +413,12 @@ is_generating = False
   ```python
   if is_generating:
       # In generation, the decoder predicts a next target word based on
-       # the encoded source sequence and the last generated target word.
+       # the encoded source sequence and the previous generated target word.

       # The encoded source sequence (encoder's output) must be specified by
       # StaticInput, which is a read-only memory.
-       # Embedding of the last generated word is automatically gotten by
-       # GeneratedInputs, which is initialized by a start mark, such as <s>,
-       # and must be included in generation.
+       # Embedding of the previous generated word is automatically retrieved
+       # by GeneratedInputs initialized by a start mark <s>.

       trg_embedding = paddle.layer.GeneratedInput(
           size=target_dict_dim,
@@ -546,36 +547,31 @@ Note: Our configuration is based on Bahdanau et al. \[[4](#Reference)\] but with

   ```python
   if is_generating:
-        # get the dictionary
-        src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
-
-        # the delimited element of generated sequences is -1,
-        # the first element of each generated sequence is the sequence length
-        seq_list = []
-        seq = []
-        for w in beam_result[1]:
-            if w != -1:
-                seq.append(w)
-            else:
-                seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
-                seq = []
-
-        prob = beam_result[0]
-        for i in xrange(gen_num):
-            print "\n*******************************************************\n"
-            print "src:", ' '.join(
-                [src_dict.get(w) for w in gen_data[i][0]]), "\n"
-            for j in xrange(beam_size):
-                print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
+       # load the dictionary
+       src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+
+       gen_sen_idx = np.where(beam_result[1] == -1)[0]
+       assert len(gen_sen_idx) == len(gen_data) * beam_size
+
+       # -1 is the delimiter of generated sequences.
+       # the first element of each generated sequence is its length.
+       start_pos, end_pos = 1, 0
+       for i, sample in enumerate(gen_data):
+           print(" ".join([src_dict[w] for w in sample[0][1:-1]]))
+           for j in xrange(beam_size):
+               end_pos = gen_sen_idx[i * beam_size + j]
+               print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
+                     trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
+               start_pos = end_pos + 2
+           print("\n")
   ```

  The generating log is as follows:
  ```text
-  src: <s> Les <unk> se <unk> au sujet de la largeur des sièges alors que de grosses commandes sont en jeu <e>
-
-  prob = -19.019573: The <unk> will be rotated about the width of the seats , while large orders are at stake . <e>
-  prob = -19.113066: The <unk> will be rotated about the width of the seats , while large commands are at stake . <e>
-  prob = -19.512890: The <unk> will be rotated about the width of the seats , while large commands are at play . <e>
+  Les <unk> se <unk> au sujet de la largeur des sièges alors que de grosses commandes sont en jeu
+  -19.0196        The <unk> will be rotated about the width of the seats , while large orders are at stake . <e>
+  -19.1131        The <unk> will be rotated about the width of the seats , while large commands are at stake . <e>
+  -19.5129        The <unk> will be rotated about the width of the seats , while large commands are at play . <e>
  ```

 ## Summary

--- a/08.machine_translation/train.py
+++ b/08.machine_translation/train.py
 import sys
+import gzip
+import numpy as np

 import paddle.v2 as paddle


-def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
+def save_model(parameters, save_path):
+    with gzip.open(save_path, 'w') as f:
+        parameters.to_tar(f)
+
+
+def seq_to_seq_net(source_dict_dim,
+                   target_dict_dim,
+                   is_generating,
+                   beam_size=3,
+                   max_length=250):
    ### Network Architecture
    word_vector_dim = 512  # dimension of word vector
-    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
-    encoder_size = 512  # dimension of hidden unit in GRU Encoder network
-
-    beam_size = 3
-    max_length = 250
+    decoder_size = 512  # dimension of hidden unit of GRU decoder
+    encoder_size = 512  # dimension of hidden unit of GRU encoder

    #### Encoder
    src_word_id = paddle.layer.data(
        name='source_language_word',
        type=paddle.data_type.integer_value_sequence(source_dict_dim))
    src_embedding = paddle.layer.embedding(
-        input=src_word_id,
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
+        input=src_word_id, size=word_vector_dim)
    src_forward = paddle.networks.simple_gru(
        input=src_embedding, size=encoder_size)
    src_backward = paddle.networks.simple_gru(
@@ -27,16 +33,19 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])

    #### Decoder
-    encoded_proj = paddle.layer.mixed(
+    encoded_proj = paddle.layer.fc(
+        act=paddle.activation.Linear(),
        size=decoder_size,
-        input=paddle.layer.full_matrix_projection(encoded_vector))
+        bias_attr=False,
+        input=encoded_vector)

    backward_first = paddle.layer.first_seq(input=src_backward)

-    decoder_boot = paddle.layer.mixed(
+    decoder_boot = paddle.layer.fc(
        size=decoder_size,
        act=paddle.activation.Tanh(),
-        input=paddle.layer.full_matrix_projection(backward_first))
+        bias_attr=False,
+        input=backward_first)

    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):

@@ -48,12 +57,13 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
            encoded_proj=enc_proj,
            decoder_state=decoder_mem)

-        decoder_inputs = paddle.layer.mixed(
+        decoder_inputs = paddle.layer.fc(
+            act=paddle.activation.Linear(),
            size=decoder_size * 3,
-            input=[
-                paddle.layer.full_matrix_projection(input=context),
-                paddle.layer.full_matrix_projection(input=current_word)
-            ])
+            bias_attr=False,
+            input=[context, current_word],
+            layer_attr=paddle.attr.ExtraLayerAttribute(
+                error_clipping_threshold=100.0))

        gru_step = paddle.layer.gru_step(
            name='gru_decoder',
@@ -61,16 +71,16 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
            output_mem=decoder_mem,
            size=decoder_size)

-        out = paddle.layer.mixed(
+        out = paddle.layer.fc(
            size=target_dict_dim,
            bias_attr=True,
            act=paddle.activation.Softmax(),
-            input=paddle.layer.full_matrix_projection(input=gru_step))
+            input=gru_step)
        return out

-    decoder_group_name = "decoder_group"
-    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    decoder_group_name = 'decoder_group'
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj)
    group_inputs = [group_input1, group_input2]

    if not is_generating:
@@ -100,13 +110,12 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
        return cost
    else:
        # In generation, the decoder predicts a next target word based on
-        # the encoded source sequence and the last generated target word.
+        # the encoded source sequence and the previously generated target word.

        # The encoded source sequence (encoder's output) must be specified by
        # StaticInput, which is a read-only memory.
-        # Embedding of the last generated word is automatically gotten by
-        # GeneratedInputs, which is initialized by a start mark, such as <s>,
-        # and must be included in generation.
+        # Embedding of the previously generated word is automatically retrieved
+        # by GeneratedInput, which is initialized with a start mark <s>.

        trg_embedding = paddle.layer.GeneratedInput(
            size=target_dict_dim,
@@ -136,32 +145,43 @@ def main():

    # train the network
    if not is_generating:
-        cost = seqToseq_net(source_dict_dim, target_dict_dim)
-        parameters = paddle.parameters.create(cost)
-
        # define optimize method and trainer
        optimizer = paddle.optimizer.Adam(
            learning_rate=5e-5,
            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+
+        cost = seq_to_seq_net(source_dict_dim, target_dict_dim, is_generating)
+        parameters = paddle.parameters.create(cost)
+
        trainer = paddle.trainer.SGD(
            cost=cost, parameters=parameters, update_equation=optimizer)
        # define data reader
        wmt14_reader = paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.wmt14.train(dict_size), buf_size=8192),
-            batch_size=5)
+            batch_size=4)

        # define event_handler callback
        def event_handler(event):
            if isinstance(event, paddle.event.EndIteration):
                if event.batch_id % 10 == 0:
-                    print "\nPass %d, Batch %d, Cost %f, %s" % (
-                        event.pass_id, event.batch_id, event.cost,
-                        event.metrics)
+                    print("\nPass %d, Batch %d, Cost %f, %s" %
+                          (event.pass_id, event.batch_id, event.cost,
+                           event.metrics))
                else:
                    sys.stdout.write('.')
                    sys.stdout.flush()

+                if not event.batch_id % 10:
+                    save_path = 'params_pass_%05d_batch_%05d.tar.gz' % (
+                        event.pass_id, event.batch_id)
+                    save_model(parameters, save_path)
+
+            if isinstance(event, paddle.event.EndPass):
+                # save parameters
+                save_path = 'params_pass_%05d.tar.gz' % (event.pass_id)
+                save_model(parameters, save_path)
+
        # start to train
        trainer.train(
            reader=wmt14_reader, event_handler=event_handler, num_passes=2)
@@ -169,17 +189,20 @@ def main():
    # generate a english sequence to french
    else:
        # use the first 3 samples for generation
-        gen_creator = paddle.dataset.wmt14.gen(dict_size)
        gen_data = []
        gen_num = 3
-        for item in gen_creator():
-            gen_data.append((item[0], ))
+        for item in paddle.dataset.wmt14.gen(dict_size)():
+            gen_data.append([item[0]])
            if len(gen_data) == gen_num:
                break

-        beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating)
-        # get the pretrained model, whose bleu = 26.92
+        beam_size = 3
+        beam_gen = seq_to_seq_net(source_dict_dim, target_dict_dim,
+                                  is_generating, beam_size)
+
+        # get the trained model, whose bleu = 26.92
        parameters = paddle.dataset.wmt14.model()
+
        # prob is the prediction probabilities, and id is the prediction word.
        beam_result = paddle.infer(
            output_layer=beam_gen,
@@ -187,28 +210,25 @@ def main():
            input=gen_data,
            field=['prob', 'id'])

-        # get the dictionary
+        # load the dictionary
        src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)

-        # the delimited element of generated sequences is -1,
-        # the first element of each generated sequence is the sequence length
-        seq_list = []
-        seq = []
-        for w in beam_result[1]:
-            if w != -1:
-                seq.append(w)
-            else:
-                seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
-                seq = []
-
-        prob = beam_result[0]
-        beam_size = 3
-        for i in xrange(gen_num):
-            print "\n*******************************************************\n"
-            print "src:", ' '.join(
-                [src_dict.get(w) for w in gen_data[i][0]]), "\n"
+        gen_sen_idx = np.where(beam_result[1] == -1)[0]
+        assert len(gen_sen_idx) == len(gen_data) * beam_size
+
+        # -1 is the delimiter of generated sequences.
+        # the first element of each generated sequence is its length.
+        start_pos, end_pos = 1, 0
+        for i, sample in enumerate(gen_data):
+            print(
+                " ".join([src_dict[w] for w in sample[0][1:-1]])
+            )  # skip the start and ending mark when printing the source sentence
            for j in xrange(beam_size):
-                print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
+                end_pos = gen_sen_idx[i * beam_size + j]
+                print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
+                    trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
+                start_pos = end_pos + 2
+            print("\n")


 if __name__ == '__main__':