diff --git a/image_classification/README.md b/image_classification/README.md
index 37aa1f06a9484e2901d80b4aaba6054db923485d..843d683c004865489579f684cda15218e5c77379 100644
--- a/image_classification/README.md
+++ b/image_classification/README.md
@@ -235,4 +235,4 @@ parameters.init_from_tar(gzip.open('Paddle_ResNet50.tar.gz', 'r'))
 ```
 
 ### 注意事项
-模型压缩包中所含各文件的文件名对应了和模型配置中的参数名一一对应，是加载模型参数的依据。我们提供的预训练模型均使用了示例代码中的配置，如需修改网络配置，请多加注意，需要保证网络配置中的参数名和压缩包中的文件名能够正确对应。
+模型压缩包中所含各文件的文件名和模型配置中的参数名一一对应，是加载模型参数的依据。我们提供的预训练模型均使用了示例代码中的配置，如需修改网络配置，请多加注意，需要保证网络配置中的参数名和压缩包中的文件名能够正确对应。
diff --git a/image_classification/index.html b/image_classification/index.html
index 33e55cebd7e939ad66d8cb4b8b2b1168c338e9ed..48009093f9505fa425890d3103bf0c8e21073b63 100644
--- a/image_classification/index.html
+++ b/image_classification/index.html
@@ -277,7 +277,7 @@ parameters.init_from_tar(gzip.open('Paddle_ResNet50.tar.gz', 'r'))
 ```
 
 ### 注意事项
-模型压缩包中所含各文件的文件名对应了和模型配置中的参数名一一对应，是加载模型参数的依据。我们提供的预训练模型均使用了示例代码中的配置，如需修改网络配置，请多加注意，需要保证网络配置中的参数名和压缩包中的文件名能够正确对应。
+模型压缩包中所含各文件的文件名和模型配置中的参数名一一对应，是加载模型参数的依据。我们提供的预训练模型均使用了示例代码中的配置，如需修改网络配置，请多加注意，需要保证网络配置中的参数名和压缩包中的文件名能够正确对应。
 
 </div>
 <!-- You can change the lines below now. -->
diff --git a/nested_sequence/text_classification/.gitignore b/nested_sequence/text_classification/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..dde3895fc112ad34a839b2fed9210ac2288a959b
--- /dev/null
+++ b/nested_sequence/text_classification/.gitignore
@@ -0,0 +1,2 @@
+.DS_Store
+*.pyc
diff --git a/nested_sequence/text_classification/README.md b/nested_sequence/text_classification/README.md
index dbc1b4a5d31a24e6279ccc1cebaa22583c4d77d7..5ba48d65dfbc371fcd7af21ab9b063de8f47cc57 100644
--- a/nested_sequence/text_classification/README.md
+++ b/nested_sequence/text_classification/README.md
@@ -26,7 +26,7 @@ PaddlePaddle 实现该网络结构的代码见 `network_conf.py`。
 ``` python
 nest_group = paddle.layer.recurrent_group(input=[paddle.layer.SubsequenceInput(emb),
                                                  hidden_size],
-                                         step=cnn_cov_group)
+                                          step=cnn_cov_group)
 ```
 
 
@@ -40,10 +40,10 @@ CNN网络具体代码实现如下：
 ```python
 def cnn_cov_group(group_input, hidden_size):
     """
-    Covolution group definition
+    Convolution group definition.
     :param group_input: The input of this layer.
     :type group_input: LayerOutput
-    :params hidden_size: Size of FC layer.
+    :param hidden_size: The size of the fully connected layer.
     :type hidden_size: int
     """
     conv3 = paddle.networks.sequence_conv_pool(
@@ -63,11 +63,13 @@ PaddlePaddle 中已经封装好的带有池化的文本序列卷积模块：`pad
 
 在得到每个句子的表示向量之后， 将所有句子表示向量经过一个平均池化层， 得到一个样本的向量表示， 向量经过一个全连接层输出最终的预测结果。 代码如下：
 ```python
-avg_pool = paddle.layer.pooling(input=nest_group, pooling_type=paddle.pooling.Avg(),
-                                    agg_level=paddle.layer.AggregateLevel.TO_NO_SEQUENCE)  
+avg_pool = paddle.layer.pooling(input=nest_group,
+                                pooling_type=paddle.pooling.Avg(),
+                                agg_level=paddle.layer.AggregateLevel.TO_NO_SEQUENCE)
+
 prob = paddle.layer.mixed(size=class_num,
-                                     input=[paddle.layer.full_matrix_projection(input=avg_pool)],
-                                       act=paddle.activation.Softmax())
+                          input=[paddle.layer.full_matrix_projection(input=avg_pool)],
+                          act=paddle.activation.Softmax())
 ```
 ## 安装依赖包
 ```bash
@@ -122,10 +124,10 @@ python infer.py --model_path 'models/params_pass_00000.tar.gz'
 
 输入数据格式如下：每一行为一条样本，以 `\t` 分隔，第一列是类别标签，第二列是输入文本的内容。以下是两条示例数据：
 
-    ```
-    positive        This movie is very good. The actor is so handsome.
-    negative        What a terrible movie. I waste so much time.
-    ```
+```
+positive        This movie is very good. The actor is so handsome.
+negative        What a terrible movie. I waste so much time.
+```
 
 2.编写数据读取接口
 
diff --git a/nested_sequence/text_classification/index.html b/nested_sequence/text_classification/index.html
index 005de9249ac5d3b08e9e0537965bae946b623dec..e465343f4bd692bea1fa33adcb36f5e4da163edc 100644
--- a/nested_sequence/text_classification/index.html
+++ b/nested_sequence/text_classification/index.html
@@ -68,7 +68,7 @@ PaddlePaddle 实现该网络结构的代码见 `network_conf.py`。
 ``` python
 nest_group = paddle.layer.recurrent_group(input=[paddle.layer.SubsequenceInput(emb),
                                                  hidden_size],
-                                         step=cnn_cov_group)
+                                          step=cnn_cov_group)
 ```
 
 
@@ -82,10 +82,10 @@ CNN网络具体代码实现如下：
 ```python
 def cnn_cov_group(group_input, hidden_size):
     """
-    Covolution group definition
+    Convolution group definition.
     :param group_input: The input of this layer.
     :type group_input: LayerOutput
-    :params hidden_size: Size of FC layer.
+    :param hidden_size: The size of the fully connected layer.
     :type hidden_size: int
     """
     conv3 = paddle.networks.sequence_conv_pool(
@@ -105,11 +105,13 @@ PaddlePaddle 中已经封装好的带有池化的文本序列卷积模块：`pad
 
 在得到每个句子的表示向量之后， 将所有句子表示向量经过一个平均池化层， 得到一个样本的向量表示， 向量经过一个全连接层输出最终的预测结果。 代码如下：
 ```python
-avg_pool = paddle.layer.pooling(input=nest_group, pooling_type=paddle.pooling.Avg(),
-                                    agg_level=paddle.layer.AggregateLevel.TO_NO_SEQUENCE)  
+avg_pool = paddle.layer.pooling(input=nest_group,
+                                pooling_type=paddle.pooling.Avg(),
+                                agg_level=paddle.layer.AggregateLevel.TO_NO_SEQUENCE)
+
 prob = paddle.layer.mixed(size=class_num,
-                                     input=[paddle.layer.full_matrix_projection(input=avg_pool)],
-                                       act=paddle.activation.Softmax())
+                          input=[paddle.layer.full_matrix_projection(input=avg_pool)],
+                          act=paddle.activation.Softmax())
 ```
 ## 安装依赖包
 ```bash
@@ -164,10 +166,10 @@ python infer.py --model_path 'models/params_pass_00000.tar.gz'
 
 输入数据格式如下：每一行为一条样本，以 `\t` 分隔，第一列是类别标签，第二列是输入文本的内容。以下是两条示例数据：
 
-    ```
-    positive        This movie is very good. The actor is so handsome.
-    negative        What a terrible movie. I waste so much time.
-    ```
+```
+positive        This movie is very good. The actor is so handsome.
+negative        What a terrible movie. I waste so much time.
+```
 
 2.编写数据读取接口
 
diff --git a/scheduled_sampling/README.md b/scheduled_sampling/README.md
index 644c1b960cb7a153a69b4c4f282008372194943c..016f040e8986bccab0043dfc4579fedf17bb2bba 100644
--- a/scheduled_sampling/README.md
+++ b/scheduled_sampling/README.md
@@ -60,52 +60,52 @@ class RandomScheduleGenerator:
 `__init__`方法对类进行初始化，其`schedule_type`参数指定了使用哪种衰减方式，可选的方式有`constant`、`linear`、`exponential`和`inverse_sigmoid`。`constant`指对所有的mini-batch使用固定的$\epsilon_i$，`linear`指线性衰减方式，`exponential`表示指数衰减方式，`inverse_sigmoid`表示反向Sigmoid衰减。`__init__`方法的参数`a`和`b`表示衰减方法的参数，需要在验证集上调优。`self.schedule_computers`将衰减方式映射为计算$\epsilon_i$的函数。最后一行根据`schedule_type`将选择的衰减函数赋给`self.schedule_computer`变量。
 
 ```python
-    def __init__(self, schedule_type, a, b):
-        """
-        schduled_type: is the type of the decay. It supports constant, linear,
-        exponential, and inverse_sigmoid right now.
-        a: parameter of the decay (MUST BE DOUBLE)
-        b: parameter of the decay (MUST BE DOUBLE)
-        """
-        self.schedule_type = schedule_type
-        self.a = a
-        self.b = b
-        self.data_processed_ = 0
-        self.schedule_computers = {
-            "constant": lambda a, b, d: a,
-            "linear": lambda a, b, d: max(a, 1 - d / b),
-            "exponential": lambda a, b, d: pow(a, d / b),
-            "inverse_sigmoid": lambda a, b, d: b / (b + math.exp(d * a / b)),
-        }
-        assert (self.schedule_type in self.schedule_computers)
-        self.schedule_computer = self.schedule_computers[self.schedule_type]
+def __init__(self, schedule_type, a, b):
+    """
+    schedule_type: is the type of the decay. It supports constant, linear,
+    exponential, and inverse_sigmoid right now.
+    a: parameter of the decay (MUST BE DOUBLE)
+    b: parameter of the decay (MUST BE DOUBLE)
+    """
+    self.schedule_type = schedule_type
+    self.a = a
+    self.b = b
+    self.data_processed_ = 0
+    self.schedule_computers = {
+        "constant": lambda a, b, d: a,
+        "linear": lambda a, b, d: max(a, 1 - d / b),
+        "exponential": lambda a, b, d: pow(a, d / b),
+        "inverse_sigmoid": lambda a, b, d: b / (b + math.exp(d * a / b)),
+    }
+    assert (self.schedule_type in self.schedule_computers)
+    self.schedule_computer = self.schedule_computers[self.schedule_type]
 ```
 
 `getScheduleRate`根据衰减函数和已经处理的数据量计算$\epsilon_i$。
 
 ```python
-    def getScheduleRate(self):
-        """
-        Get the schedule sampling rate. Usually not needed to be called by the users
-        """
-        return self.schedule_computer(self.a, self.b, self.data_processed_)
+def getScheduleRate(self):
+    """
+    Get the schedule sampling rate. Usually not needed to be called by the users
+    """
+    return self.schedule_computer(self.a, self.b, self.data_processed_)
 
 ```
 
 `processBatch`方法根据概率值$\epsilon_i$进行采样，得到`indexes`，`indexes`中每个元素取值为`0`的概率为$\epsilon_i$，取值为`1`的概率为$1-\epsilon_i$。`indexes`决定了解码器的输入是真实元素还是生成的元素，取值为`0`表示使用真实元素，取值为`1`表示使用生成的元素。
 
 ```python
-    def processBatch(self, batch_size):
-        """
-        Get a batch_size of sampled indexes. These indexes can be passed to a
-        MultiplexLayer to select from the grouth truth and generated samples
-        from the last time step.
-        """
-        rate = self.getScheduleRate()
-        numbers = np.random.rand(batch_size)
-        indexes = (numbers >= rate).astype('int32').tolist()
-        self.data_processed_ += batch_size
-        return indexes
+def processBatch(self, batch_size):
+    """
+    Get a batch_size of sampled indexes. These indexes can be passed to a
+    MultiplexLayer to select from the ground truth and generated samples
+    from the last time step.
+    """
+    rate = self.getScheduleRate()
+    numbers = np.random.rand(batch_size)
+    indexes = (numbers >= rate).astype('int32').tolist()
+    self.data_processed_ += batch_size
+    return indexes
 ```
 
 Scheduled Sampling需要在序列到序列模型的基础上增加一个输入`true_token_flag`，以控制解码器输入。
@@ -148,62 +148,62 @@ def gen_schedule_data(reader):
 训练时`recurrent_group`每一步调用的解码器函数如下：
 
 ```python
-    def gru_decoder_with_attention_train(enc_vec, enc_proj, true_word,
-                                         true_token_flag):
-        """
-        The decoder step for training.
-        :param enc_vec: the encoder vector for attention
-        :type enc_vec: LayerOutput
-        :param enc_proj: the encoder projection for attention
-        :type enc_proj: LayerOutput
-        :param true_word: the ground-truth target word
-        :type true_word: LayerOutput
-        :param true_token_flag: the flag of using the ground-truth target word
-        :type true_token_flag: LayerOutput
-        :return: the softmax output layer
-        :rtype: LayerOutput
-        """
-
-        decoder_mem = paddle.layer.memory(
-            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-
-        context = paddle.networks.simple_attention(
-            encoded_sequence=enc_vec,
-            encoded_proj=enc_proj,
-            decoder_state=decoder_mem)
-
-        gru_out_memory = paddle.layer.memory(
-            name='gru_out', size=target_dict_dim)
-
-        generated_word = paddle.layer.max_id(input=gru_out_memory)
-
-        generated_word_emb = paddle.layer.embedding(
-            input=generated_word,
-            size=word_vector_dim,
-            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-
-        current_word = paddle.layer.multiplex(
-            input=[true_token_flag, true_word, generated_word_emb])
-
-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
-
-        gru_step = paddle.layer.gru_step(
-            name='gru_decoder',
-            input=decoder_inputs,
-            output_mem=decoder_mem,
-            size=decoder_size)
-
-        with paddle.layer.mixed(
-                name='gru_out',
-                size=target_dict_dim,
-                bias_attr=True,
-                act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
-
-        return out
+def gru_decoder_with_attention_train(enc_vec, enc_proj, true_word,
+                                     true_token_flag):
+    """
+    The decoder step for training.
+    :param enc_vec: the encoder vector for attention
+    :type enc_vec: LayerOutput
+    :param enc_proj: the encoder projection for attention
+    :type enc_proj: LayerOutput
+    :param true_word: the ground-truth target word
+    :type true_word: LayerOutput
+    :param true_token_flag: the flag of using the ground-truth target word
+    :type true_token_flag: LayerOutput
+    :return: the softmax output layer
+    :rtype: LayerOutput
+    """
+
+    decoder_mem = paddle.layer.memory(
+        name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
+
+    context = paddle.networks.simple_attention(
+        encoded_sequence=enc_vec,
+        encoded_proj=enc_proj,
+        decoder_state=decoder_mem)
+
+    gru_out_memory = paddle.layer.memory(
+        name='gru_out', size=target_dict_dim)
+
+    generated_word = paddle.layer.max_id(input=gru_out_memory)
+
+    generated_word_emb = paddle.layer.embedding(
+        input=generated_word,
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+
+    current_word = paddle.layer.multiplex(
+        input=[true_token_flag, true_word, generated_word_emb])
+
+    with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
+        decoder_inputs += paddle.layer.full_matrix_projection(input=context)
+        decoder_inputs += paddle.layer.full_matrix_projection(
+            input=current_word)
+
+    gru_step = paddle.layer.gru_step(
+        name='gru_decoder',
+        input=decoder_inputs,
+        output_mem=decoder_mem,
+        size=decoder_size)
+
+    with paddle.layer.mixed(
+            name='gru_out',
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax()) as out:
+        out += paddle.layer.full_matrix_projection(input=gru_step)
+
+    return out
 ```
 
 该函数使用`memory`层`gru_out_memory`记忆上一时刻生成的元素，根据`gru_out_memory`选择概率最大的词语`generated_word`作为生成的词语。`multiplex`层会在真实元素`true_word`和生成的元素`generated_word`之间做出选择，并将选择的结果作为解码器输入。`multiplex`层使用了三个输入，分别为`true_token_flag`、`true_word`和`generated_word_emb`。对于这三个输入中每个元素，若`true_token_flag`中的值为`0`，则`multiplex`层输出`true_word`中的相应元素；若`true_token_flag`中的值为`1`，则`multiplex`层输出`generated_word_emb`中的相应元素。
diff --git a/scheduled_sampling/index.html b/scheduled_sampling/index.html
index 224f598126cfbe477058a7af45a30dd9d4c8764f..8d327c4b1ad687ef18ca81302cf6dccf049cab51 100644
--- a/scheduled_sampling/index.html
+++ b/scheduled_sampling/index.html
@@ -102,52 +102,52 @@ class RandomScheduleGenerator:
 `__init__`方法对类进行初始化，其`schedule_type`参数指定了使用哪种衰减方式，可选的方式有`constant`、`linear`、`exponential`和`inverse_sigmoid`。`constant`指对所有的mini-batch使用固定的$\epsilon_i$，`linear`指线性衰减方式，`exponential`表示指数衰减方式，`inverse_sigmoid`表示反向Sigmoid衰减。`__init__`方法的参数`a`和`b`表示衰减方法的参数，需要在验证集上调优。`self.schedule_computers`将衰减方式映射为计算$\epsilon_i$的函数。最后一行根据`schedule_type`将选择的衰减函数赋给`self.schedule_computer`变量。
 
 ```python
-    def __init__(self, schedule_type, a, b):
-        """
-        schduled_type: is the type of the decay. It supports constant, linear,
-        exponential, and inverse_sigmoid right now.
-        a: parameter of the decay (MUST BE DOUBLE)
-        b: parameter of the decay (MUST BE DOUBLE)
-        """
-        self.schedule_type = schedule_type
-        self.a = a
-        self.b = b
-        self.data_processed_ = 0
-        self.schedule_computers = {
-            "constant": lambda a, b, d: a,
-            "linear": lambda a, b, d: max(a, 1 - d / b),
-            "exponential": lambda a, b, d: pow(a, d / b),
-            "inverse_sigmoid": lambda a, b, d: b / (b + math.exp(d * a / b)),
-        }
-        assert (self.schedule_type in self.schedule_computers)
-        self.schedule_computer = self.schedule_computers[self.schedule_type]
+def __init__(self, schedule_type, a, b):
+    """
+    schedule_type: is the type of the decay. It supports constant, linear,
+    exponential, and inverse_sigmoid right now.
+    a: parameter of the decay (MUST BE DOUBLE)
+    b: parameter of the decay (MUST BE DOUBLE)
+    """
+    self.schedule_type = schedule_type
+    self.a = a
+    self.b = b
+    self.data_processed_ = 0
+    self.schedule_computers = {
+        "constant": lambda a, b, d: a,
+        "linear": lambda a, b, d: max(a, 1 - d / b),
+        "exponential": lambda a, b, d: pow(a, d / b),
+        "inverse_sigmoid": lambda a, b, d: b / (b + math.exp(d * a / b)),
+    }
+    assert (self.schedule_type in self.schedule_computers)
+    self.schedule_computer = self.schedule_computers[self.schedule_type]
 ```
 
 `getScheduleRate`根据衰减函数和已经处理的数据量计算$\epsilon_i$。
 
 ```python
-    def getScheduleRate(self):
-        """
-        Get the schedule sampling rate. Usually not needed to be called by the users
-        """
-        return self.schedule_computer(self.a, self.b, self.data_processed_)
+def getScheduleRate(self):
+    """
+    Get the schedule sampling rate. Usually not needed to be called by the users
+    """
+    return self.schedule_computer(self.a, self.b, self.data_processed_)
 
 ```
 
 `processBatch`方法根据概率值$\epsilon_i$进行采样，得到`indexes`，`indexes`中每个元素取值为`0`的概率为$\epsilon_i$，取值为`1`的概率为$1-\epsilon_i$。`indexes`决定了解码器的输入是真实元素还是生成的元素，取值为`0`表示使用真实元素，取值为`1`表示使用生成的元素。
 
 ```python
-    def processBatch(self, batch_size):
-        """
-        Get a batch_size of sampled indexes. These indexes can be passed to a
-        MultiplexLayer to select from the grouth truth and generated samples
-        from the last time step.
-        """
-        rate = self.getScheduleRate()
-        numbers = np.random.rand(batch_size)
-        indexes = (numbers >= rate).astype('int32').tolist()
-        self.data_processed_ += batch_size
-        return indexes
+def processBatch(self, batch_size):
+    """
+    Get a batch_size of sampled indexes. These indexes can be passed to a
+    MultiplexLayer to select from the ground truth and generated samples
+    from the last time step.
+    """
+    rate = self.getScheduleRate()
+    numbers = np.random.rand(batch_size)
+    indexes = (numbers >= rate).astype('int32').tolist()
+    self.data_processed_ += batch_size
+    return indexes
 ```
 
 Scheduled Sampling需要在序列到序列模型的基础上增加一个输入`true_token_flag`，以控制解码器输入。
@@ -190,62 +190,62 @@ def gen_schedule_data(reader):
 训练时`recurrent_group`每一步调用的解码器函数如下：
 
 ```python
-    def gru_decoder_with_attention_train(enc_vec, enc_proj, true_word,
-                                         true_token_flag):
-        """
-        The decoder step for training.
-        :param enc_vec: the encoder vector for attention
-        :type enc_vec: LayerOutput
-        :param enc_proj: the encoder projection for attention
-        :type enc_proj: LayerOutput
-        :param true_word: the ground-truth target word
-        :type true_word: LayerOutput
-        :param true_token_flag: the flag of using the ground-truth target word
-        :type true_token_flag: LayerOutput
-        :return: the softmax output layer
-        :rtype: LayerOutput
-        """
-
-        decoder_mem = paddle.layer.memory(
-            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-
-        context = paddle.networks.simple_attention(
-            encoded_sequence=enc_vec,
-            encoded_proj=enc_proj,
-            decoder_state=decoder_mem)
-
-        gru_out_memory = paddle.layer.memory(
-            name='gru_out', size=target_dict_dim)
-
-        generated_word = paddle.layer.max_id(input=gru_out_memory)
-
-        generated_word_emb = paddle.layer.embedding(
-            input=generated_word,
-            size=word_vector_dim,
-            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-
-        current_word = paddle.layer.multiplex(
-            input=[true_token_flag, true_word, generated_word_emb])
-
-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
-
-        gru_step = paddle.layer.gru_step(
-            name='gru_decoder',
-            input=decoder_inputs,
-            output_mem=decoder_mem,
-            size=decoder_size)
-
-        with paddle.layer.mixed(
-                name='gru_out',
-                size=target_dict_dim,
-                bias_attr=True,
-                act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
-
-        return out
+def gru_decoder_with_attention_train(enc_vec, enc_proj, true_word,
+                                     true_token_flag):
+    """
+    The decoder step for training.
+    :param enc_vec: the encoder vector for attention
+    :type enc_vec: LayerOutput
+    :param enc_proj: the encoder projection for attention
+    :type enc_proj: LayerOutput
+    :param true_word: the ground-truth target word
+    :type true_word: LayerOutput
+    :param true_token_flag: the flag of using the ground-truth target word
+    :type true_token_flag: LayerOutput
+    :return: the softmax output layer
+    :rtype: LayerOutput
+    """
+
+    decoder_mem = paddle.layer.memory(
+        name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
+
+    context = paddle.networks.simple_attention(
+        encoded_sequence=enc_vec,
+        encoded_proj=enc_proj,
+        decoder_state=decoder_mem)
+
+    gru_out_memory = paddle.layer.memory(
+        name='gru_out', size=target_dict_dim)
+
+    generated_word = paddle.layer.max_id(input=gru_out_memory)
+
+    generated_word_emb = paddle.layer.embedding(
+        input=generated_word,
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+
+    current_word = paddle.layer.multiplex(
+        input=[true_token_flag, true_word, generated_word_emb])
+
+    with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
+        decoder_inputs += paddle.layer.full_matrix_projection(input=context)
+        decoder_inputs += paddle.layer.full_matrix_projection(
+            input=current_word)
+
+    gru_step = paddle.layer.gru_step(
+        name='gru_decoder',
+        input=decoder_inputs,
+        output_mem=decoder_mem,
+        size=decoder_size)
+
+    with paddle.layer.mixed(
+            name='gru_out',
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax()) as out:
+        out += paddle.layer.full_matrix_projection(input=gru_step)
+
+    return out
 ```
 
 该函数使用`memory`层`gru_out_memory`记忆上一时刻生成的元素，根据`gru_out_memory`选择概率最大的词语`generated_word`作为生成的词语。`multiplex`层会在真实元素`true_word`和生成的元素`generated_word`之间做出选择，并将选择的结果作为解码器输入。`multiplex`层使用了三个输入，分别为`true_token_flag`、`true_word`和`generated_word_emb`。对于这三个输入中每个元素，若`true_token_flag`中的值为`0`，则`multiplex`层输出`true_word`中的相应元素；若`true_token_flag`中的值为`1`，则`multiplex`层输出`generated_word_emb`中的相应元素。