diff --git a/label_semantic_roles/README.en.md b/label_semantic_roles/README.en.md
index d43b8c931e6c4e68a3de115d177edfbcc6df57ee..fe46ee2a98020bcf0ce581726d10e4d8c74962a6 100644
--- a/label_semantic_roles/README.en.md
+++ b/label_semantic_roles/README.en.md
@@ -134,7 +134,7 @@ After modification, the model is as follows:
-<img src="./image/bd_lstm_en.png">
+<img src="./image/db_lstm_network_en.png">
 Fig 6. DB-LSTM for SRL tasks
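The hunks that follow renumber the configuration steps for the DB-LSTM that Fig 6 depicts, but the stacking logic is only visible in fragments there. Below is a minimal sketch of the alternating-direction LSTM stack, assuming the tutorial's own names (`hidden_0`, `hidden_dim`, `depth = 8`) and the v2 `lstmemory(reverse=...)` keyword; the parameter attributes and activations of the real config are omitted.

```python
import paddle.v2 as paddle

depth = 8         # the tutorial stacks eight LSTM layers
hidden_dim = 512  # tutorial hyperparameter; yields 128-dim LSTM cells (512/4)

# hidden_0 is assumed here: the mixed projection of the embedding layers,
# as defined in the "8 LSTM units" hunk below.
lstm_0 = paddle.layer.lstmemory(input=hidden_0, reverse=False)

input_tmp = [hidden_0, lstm_0]
for i in range(1, depth):
    # Mix the previous hidden projection with the previous LSTM output...
    mix_hidden = paddle.layer.mixed(
        size=hidden_dim,
        input=[
            paddle.layer.full_matrix_projection(input=input_tmp[0]),
            paddle.layer.full_matrix_projection(input=input_tmp[1])
        ])
    # ...then run the next LSTM in the opposite direction (odd levels reversed).
    lstm = paddle.layer.lstmemory(input=mix_hidden, reverse=((i % 2) == 1))
    input_tmp = [mix_hidden, lstm]
```

Reversing every other level is what gives the stack a bidirectional receptive field without a separate backward network per layer.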
@@ -212,7 +212,7 @@ print pred_len

 ## Model configuration

-- 1. Define input data dimensions and model hyperparameters.
+- Define input data dimensions and model hyperparameters.

 ```python
 mark_dict_len = 2    # Value range of region mark. Region mark is either 0 or 1, so the range is 2
@@ -247,7 +247,7 @@ target = paddle.layer.data(name='target', type=d_type(label_dict_len))

 A special note: hidden_dim = 512 actually specifies an LSTM hidden vector of 128 dimensions (512/4). Please refer to the official PaddlePaddle documentation for details: [lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory).

-- 2. The word sequence, predicate, predicate context, and region mark sequence are transformed into embedding vector sequences.
+- The word sequence, predicate, predicate context, and region mark sequence are transformed into embedding vector sequences.

 ```python
@@ -276,7 +276,7 @@ emb_layers.append(predicate_embedding)
 emb_layers.append(mark_embedding)
 ```

-- 3. 8 LSTM units will be trained in "forward / backward" order.
+- 8 LSTM units will be trained in "forward / backward" order.

 ```python
 hidden_0 = paddle.layer.mixed(
@@ -326,7 +326,7 @@ for i in range(1, depth):
     input_tmp = [mix_hidden, lstm]
 ```

-- 4. We will concatenate the output of the top LSTM unit with its input, and project the result into a hidden layer. Then we put a fully connected layer on top of it to get the final vector representation.
+- We will concatenate the output of the top LSTM unit with its input, and project the result into a hidden layer. Then we put a fully connected layer on top of it to get the final vector representation.

 ```python
 feature_out = paddle.layer.mixed(
@@ -340,7 +340,7 @@ for i in range(1, depth):
 ],
 )
 ```
-- 5. We use CRF as the cost function; the parameter of the CRF cost layer will be named `crfw`.
+- We use CRF as the cost function; the parameter of the CRF cost layer will be named `crfw`.

 ```python
 crf_cost = paddle.layer.crf(
@@ -353,7 +353,7 @@ crf_cost = paddle.layer.crf(
     learning_rate=mix_hidden_lr))
 ```

-- 6. A CRF decoding layer is used for evaluation and inference. It shares parameters with the CRF layer; sharing parameters among multiple layers is specified by using the same parameter name in those layers.
+- A CRF decoding layer is used for evaluation and inference. It shares parameters with the CRF layer; sharing parameters among multiple layers is specified by using the same parameter name in those layers.

 ```python
 crf_dec = paddle.layer.crf_decoding(
diff --git a/label_semantic_roles/README.md b/label_semantic_roles/README.md
index 9fcf1ae84c3f83d68f0e57be320928ff83000e67..e0413c3f6d17e3ee173dad86e4863e710dfe08ba 100644
--- a/label_semantic_roles/README.md
+++ b/label_semantic_roles/README.md
@@ -206,7 +206,7 @@ print pred_len

 ## Model configuration

-- 1. Define input data dimensions and model hyperparameters.
+- Define input data dimensions and model hyperparameters.

 ```python
 mark_dict_len = 2    # Dimension of the predicate-context region mark: a binary 0/1 feature, so the dimension is 2
@@ -240,7 +240,7 @@ target = paddle.layer.data(name='target', type=d_type(label_dict_len))

 A special note: hidden_dim = 512 specifies that the LSTM hidden vector has 128 dimensions; please refer to the [lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory) section of the official PaddlePaddle documentation.

-- 2. The sentence sequence, predicate, predicate context, and predicate-context region mark are mapped through vocabularies into sequences of real-valued word-embedding vectors.
+- The sentence sequence, predicate, predicate context, and predicate-context region mark are mapped through vocabularies into sequences of real-valued word-embedding vectors.

 ```python
@@ -269,7 +269,7 @@ emb_layers.append(predicate_embedding)
 emb_layers.append(mark_embedding)
 ```

-- 3. 8 LSTM units learn over all input sequences in "forward / backward" order.
+- 8 LSTM units learn over all input sequences in "forward / backward" order.

 ```python
 hidden_0 = paddle.layer.mixed(
@@ -319,7 +319,7 @@ for i in range(1, depth):
     input_tmp = [mix_hidden, lstm]
 ```

-- 4. Take the output of the top stacked LSTM together with the hidden-layer projection of that LSTM unit's input, and map them through a fully connected layer to the dimension of the label dictionary, yielding the final feature-vector representation.
+- Take the output of the top stacked LSTM together with the hidden-layer projection of that LSTM unit's input, and map them through a fully connected layer to the dimension of the label dictionary, yielding the final feature-vector representation.

 ```python
 feature_out = paddle.layer.mixed(
@@ -333,7 +333,7 @@ input=[
 ],
 )
 ```
-- 5. At the end of the network, a CRF layer computes the loss (cost); its parameter is named `crfw`, and this layer requires the ground-truth labels (target) as input.
+- At the end of the network, a CRF layer computes the loss (cost); its parameter is named `crfw`, and this layer requires the ground-truth labels (target) as input.

 ```python
 crf_cost = paddle.layer.crf(
@@ -346,7 +346,7 @@ crf_cost = paddle.layer.crf(
     learning_rate=mix_hidden_lr))
 ```

-- 6. The CRF decoding layer uses the same parameter name as the CRF layer, so the two share weights. Given the ground-truth labels (target), it counts mislabeled tokens and can be used to evaluate the model; without them, it decodes the optimal label sequence and can be used for prediction.
+- The CRF decoding layer uses the same parameter name as the CRF layer, so the two share weights. Given the ground-truth labels (target), it counts mislabeled tokens and can be used to evaluate the model; without them, it decodes the optimal label sequence and can be used for prediction.

 ```python
 crf_dec = paddle.layer.crf_decoding(
diff --git a/label_semantic_roles/image/bd_lstm_en.png b/label_semantic_roles/image/db_lstm_network_en.png
similarity index 100%
rename from label_semantic_roles/image/bd_lstm_en.png
rename to label_semantic_roles/image/db_lstm_network_en.png
diff --git a/label_semantic_roles/index.en.html b/label_semantic_roles/index.en.html
index 2bc090b5bf507ce5fe74d31e0aeb2d2f3f5daf67..29dea0eef47938826a6e66c52dc5724a6612eb8d 100644
--- a/label_semantic_roles/index.en.html
+++ b/label_semantic_roles/index.en.html
@@ -176,7 +176,7 @@ After modification, the model is as follows:
-<img src="./image/bd_lstm_en.png">
+<img src="./image/db_lstm_network_en.png">
 Fig 6. DB-LSTM for SRL tasks
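Items 5 and 6 in the hunks below describe the `crfw` parameter sharing between the CRF cost layer and the CRF decoding layer. A minimal sketch of how the two layers end up on the same weights, assuming `feature_out`, `target`, `label_dict_len`, `default_std`, and `mix_hidden_lr` as defined in the surrounding tutorial code:

```python
import paddle.v2 as paddle

# CRF cost layer: learns transition weights under the parameter name 'crfw'.
crf_cost = paddle.layer.crf(
    size=label_dict_len,
    input=feature_out,
    label=target,
    param_attr=paddle.attr.Param(
        name='crfw',
        initial_std=default_std,
        learning_rate=mix_hidden_lr))

# The decoding layer names the same parameter, so it decodes (or, given a
# label input, evaluates) against the very same weights learned by crf_cost.
crf_dec = paddle.layer.crf_decoding(
    size=label_dict_len,
    input=feature_out,
    param_attr=paddle.attr.Param(name='crfw'))
```

In the v2 API, reusing a parameter name is the general mechanism for weight sharing; nothing about it is specific to CRF layers.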
@@ -254,7 +254,7 @@ print pred_len

 ## Model configuration

-- 1. Define input data dimensions and model hyperparameters.
+- Define input data dimensions and model hyperparameters.

 ```python
 mark_dict_len = 2    # Value range of region mark. Region mark is either 0 or 1, so the range is 2
@@ -289,7 +289,7 @@ target = paddle.layer.data(name='target', type=d_type(label_dict_len))

 A special note: hidden_dim = 512 actually specifies an LSTM hidden vector of 128 dimensions (512/4). Please refer to the official PaddlePaddle documentation for details: [lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory).

-- 2. The word sequence, predicate, predicate context, and region mark sequence are transformed into embedding vector sequences.
+- The word sequence, predicate, predicate context, and region mark sequence are transformed into embedding vector sequences.

 ```python
@@ -318,7 +318,7 @@ emb_layers.append(predicate_embedding)
 emb_layers.append(mark_embedding)
 ```

-- 3. 8 LSTM units will be trained in "forward / backward" order.
+- 8 LSTM units will be trained in "forward / backward" order.

 ```python
 hidden_0 = paddle.layer.mixed(
@@ -368,7 +368,7 @@ for i in range(1, depth):
     input_tmp = [mix_hidden, lstm]
 ```

-- 4. We will concatenate the output of the top LSTM unit with its input, and project the result into a hidden layer. Then we put a fully connected layer on top of it to get the final vector representation.
+- We will concatenate the output of the top LSTM unit with its input, and project the result into a hidden layer. Then we put a fully connected layer on top of it to get the final vector representation.

 ```python
 feature_out = paddle.layer.mixed(
@@ -382,7 +382,7 @@ for i in range(1, depth):
 ],
 )
 ```
-- 5. We use CRF as the cost function; the parameter of the CRF cost layer will be named `crfw`.
+- We use CRF as the cost function; the parameter of the CRF cost layer will be named `crfw`.

 ```python
 crf_cost = paddle.layer.crf(
@@ -395,7 +395,7 @@ crf_cost = paddle.layer.crf(
     learning_rate=mix_hidden_lr))
 ```

-- 6. A CRF decoding layer is used for evaluation and inference. It shares parameters with the CRF layer; sharing parameters among multiple layers is specified by using the same parameter name in those layers.
+- A CRF decoding layer is used for evaluation and inference. It shares parameters with the CRF layer; sharing parameters among multiple layers is specified by using the same parameter name in those layers.

 ```python
 crf_dec = paddle.layer.crf_decoding(
diff --git a/label_semantic_roles/index.html b/label_semantic_roles/index.html
index 7f93944e57a4a76e08a83b855cffe072d3dc55cd..ed6f21dcbcffcc71a06a12963366e4e8b69669d4 100644
--- a/label_semantic_roles/index.html
+++ b/label_semantic_roles/index.html
@@ -248,7 +248,7 @@ print pred_len

 ## Model configuration

-- 1. Define input data dimensions and model hyperparameters.
+- Define input data dimensions and model hyperparameters.

 ```python
 mark_dict_len = 2    # Dimension of the predicate-context region mark: a binary 0/1 feature, so the dimension is 2
@@ -282,7 +282,7 @@ target = paddle.layer.data(name='target', type=d_type(label_dict_len))

 A special note: hidden_dim = 512 specifies that the LSTM hidden vector has 128 dimensions; please refer to the [lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory) section of the official PaddlePaddle documentation.

-- 2. The sentence sequence, predicate, predicate context, and predicate-context region mark are mapped through vocabularies into sequences of real-valued word-embedding vectors.
+- The sentence sequence, predicate, predicate context, and predicate-context region mark are mapped through vocabularies into sequences of real-valued word-embedding vectors.

 ```python
@@ -311,7 +311,7 @@ emb_layers.append(predicate_embedding)
 emb_layers.append(mark_embedding)
 ```

-- 3. 8 LSTM units learn over all input sequences in "forward / backward" order.
+- 8 LSTM units learn over all input sequences in "forward / backward" order.

 ```python
 hidden_0 = paddle.layer.mixed(
@@ -361,7 +361,7 @@ for i in range(1, depth):
     input_tmp = [mix_hidden, lstm]
 ```

-- 4. Take the output of the top stacked LSTM together with the hidden-layer projection of that LSTM unit's input, and map them through a fully connected layer to the dimension of the label dictionary, yielding the final feature-vector representation.
+- Take the output of the top stacked LSTM together with the hidden-layer projection of that LSTM unit's input, and map them through a fully connected layer to the dimension of the label dictionary, yielding the final feature-vector representation.

 ```python
 feature_out = paddle.layer.mixed(
@@ -375,7 +375,7 @@ input=[
 ],
 )
 ```
-- 5. At the end of the network, a CRF layer computes the loss (cost); its parameter is named `crfw`, and this layer requires the ground-truth labels (target) as input.
+- At the end of the network, a CRF layer computes the loss (cost); its parameter is named `crfw`, and this layer requires the ground-truth labels (target) as input.

 ```python
 crf_cost = paddle.layer.crf(
@@ -388,7 +388,7 @@ crf_cost = paddle.layer.crf(
     learning_rate=mix_hidden_lr))
 ```

-- 6. The CRF decoding layer uses the same parameter name as the CRF layer, so the two share weights. Given the ground-truth labels (target), it counts mislabeled tokens and can be used to evaluate the model; without them, it decodes the optimal label sequence and can be used for prediction.
+- The CRF decoding layer uses the same parameter name as the CRF layer, so the two share weights. Given the ground-truth labels (target), it counts mislabeled tokens and can be used to evaluate the model; without them, it decodes the optimal label sequence and can be used for prediction.

 ```python
 crf_dec = paddle.layer.crf_decoding(
diff --git a/machine_translation/README.en.md b/machine_translation/README.en.md
index 8a7f1098182b6443b58c22360da2fcdfd3439444..3713ec58c1800919a4718bee711b1fa8d358f7ec 100644
--- a/machine_translation/README.en.md
+++ b/machine_translation/README.en.md
@@ -446,6 +446,7 @@ settings(
 This tutorial will use the default SGD and Adam learning algorithm, with a learning rate of 5e-4. Note that `batch_size = 50` denotes generating 50 sequences each time.

 ### Model Structure
+
 1. Define some global variables

    ```python
@@ -493,6 +494,7 @@ This tutorial will use the default SGD and Adam learning algorithm, with a learn
     with mixed_layer(size=decoder_size) as encoded_proj:
         encoded_proj += full_matrix_projection(input=encoded_vector)
     ```
+
    3.2 Use a non-linear transformation of the last hidden state of the backward GRU on the source language sentence as the initial state of the decoder RNN $c_0=h_T$

     ```python
@@ -502,6 +504,7 @@ This tutorial will use the default SGD and Adam learning algorithm, with a learn
         act=TanhActivation(), ) as decoder_boot:
         decoder_boot += full_matrix_projection(input=backward_first)
     ```
+
    3.3 Define the computation in each time step for the decoder RNN, i.e., according to the current context vector $c_i$, the hidden state of the decoder $z_i$, and the $i$-th word $u_i$ in the target language, predict the probability $p_{i+1}$ of the $(i+1)$-th word.

    - decoder_mem records the hidden state $z_i$ from the previous time step, with decoder_boot as its initial state.
@@ -536,6 +539,7 @@ This tutorial will use the default SGD and Adam learning algorithm, with a learn
         out += full_matrix_projection(input=gru_step)
     return out
     ```
+
 4. Differences in the decoder between training and generation

    4.1 Define the name for the decoder and the first two inputs of `gru_decoder_with_attention`. Note that `StaticInput` is used for these two inputs. Please refer to the [StaticInput Document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for more details.
@@ -546,6 +550,7 @@ This tutorial will use the default SGD and Adam learning algorithm, with a learn
     group_input2 = StaticInput(input=encoded_proj, is_seq=True)
     group_inputs = [group_input1, group_input2]
     ```
+
    4.2 In training mode:

    - word embedding from the target language trg_embedding is passed to `gru_decoder_with_attention` as current_word.
@@ -571,6 +576,7 @@ This tutorial will use the default SGD and Adam learning algorithm, with a learn
         cost = classification_cost(input=decoder, label=lbl)
         outputs(cost)
     ```
+
    4.3 In generation mode:

    - during generation, as the decoder RNN will take the word vector generated from the previous time step as input, `GeneratedInput` is used to implement this automatically. Please refer to the [GeneratedInput Document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for details.
diff --git a/machine_translation/README.md b/machine_translation/README.md
index 3eec2b68c9bc3cc14e3544b1aac6c71c4265b2ff..d1902a27fefd6b2df8f1b88bcd0c082a5c176b7a 100644
--- a/machine_translation/README.md
+++ b/machine_translation/README.md
@@ -340,6 +340,7 @@ wmt14_reader = paddle.batch(
         out += paddle.layer.full_matrix_projection(input=gru_step)
     return out
     ```
+
 4. Differences in invoking the decoder between training and generation modes.

    4.1 Define the name of the decoder framework and the first two inputs of the `gru_decoder_with_attention` function. Note: these two inputs use `StaticInput`; see the [StaticInput documentation](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for details.
@@ -400,6 +401,7 @@ for param in parameters.keys():
 ```

 ### Train the model
+
 1. Construct the trainer

    The trainer is built from the optimization target (cost), the network topology, and the model parameters; an optimization method must also be specified at construction time, and here the most basic SGD is used.

@@ -409,7 +411,7 @@ for param in parameters.keys():
     trainer = paddle.trainer.SGD(cost=cost,
                                  parameters=parameters,
                                  update_equation=optimizer)
-```
+    ```

 2. Construct the event_handler

@@ -421,6 +423,7 @@ for param in parameters.keys():
             print "Pass %d, Batch %d, Cost %f, %s" % (
                 event.pass_id, event.batch_id, event.cost, event.metrics)
     ```
+
 3. Start training:

    ```python
@@ -435,7 +438,7 @@ for param in parameters.keys():
    Pass 0, Batch 0, Cost 247.408008, {'classification_error_evaluator': 1.0}
    Pass 0, Batch 10, Cost 212.058789, {'classification_error_evaluator': 0.8737863898277283}
    ...
-```
+    ```

 ## Apply the model
diff --git a/machine_translation/index.en.html b/machine_translation/index.en.html
index 06da2ebaa751aaa1eff265005a5c31d978a2f243..bafae0eb0c35fbd5b28b3c8900a9cfce41ec3893 100644
--- a/machine_translation/index.en.html
+++ b/machine_translation/index.en.html
@@ -488,6 +488,7 @@ settings(
 This tutorial will use the default SGD and Adam learning algorithm, with a learning rate of 5e-4. Note that `batch_size = 50` denotes generating 50 sequences each time.

 ### Model Structure
+
 1. Define some global variables

    ```python
@@ -535,6 +536,7 @@ This tutorial will use the default SGD and Adam learning algorithm, with a learn
     with mixed_layer(size=decoder_size) as encoded_proj:
         encoded_proj += full_matrix_projection(input=encoded_vector)
     ```
+
    3.2 Use a non-linear transformation of the last hidden state of the backward GRU on the source language sentence as the initial state of the decoder RNN $c_0=h_T$

     ```python
@@ -544,6 +546,7 @@ This tutorial will use the default SGD and Adam learning algorithm, with a learn
         act=TanhActivation(), ) as decoder_boot:
         decoder_boot += full_matrix_projection(input=backward_first)
     ```
+
    3.3 Define the computation in each time step for the decoder RNN, i.e., according to the current context vector $c_i$, the hidden state of the decoder $z_i$, and the $i$-th word $u_i$ in the target language, predict the probability $p_{i+1}$ of the $(i+1)$-th word.

    - decoder_mem records the hidden state $z_i$ from the previous time step, with decoder_boot as its initial state.
@@ -578,6 +581,7 @@ This tutorial will use the default SGD and Adam learning algorithm, with a learn
         out += full_matrix_projection(input=gru_step)
     return out
     ```
+
 4. Differences in the decoder between training and generation

    4.1 Define the name for the decoder and the first two inputs of `gru_decoder_with_attention`. Note that `StaticInput` is used for these two inputs. Please refer to the [StaticInput Document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for more details.
@@ -588,6 +592,7 @@ This tutorial will use the default SGD and Adam learning algorithm, with a learn
     group_input2 = StaticInput(input=encoded_proj, is_seq=True)
     group_inputs = [group_input1, group_input2]
     ```
+
    4.2 In training mode:

    - word embedding from the target language trg_embedding is passed to `gru_decoder_with_attention` as current_word.
@@ -613,6 +618,7 @@ This tutorial will use the default SGD and Adam learning algorithm, with a learn
         cost = classification_cost(input=decoder, label=lbl)
         outputs(cost)
     ```
+
    4.3 In generation mode:

    - during generation, as the decoder RNN will take the word vector generated from the previous time step as input, `GeneratedInput` is used to implement this automatically. Please refer to the [GeneratedInput Document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for details.
diff --git a/machine_translation/index.html b/machine_translation/index.html
index 78a7ea99b9a3d884040b5671f49136b48a7852a5..a960a2cc7f948d9c76721b9f05a0492cbc149914 100644
--- a/machine_translation/index.html
+++ b/machine_translation/index.html
@@ -382,6 +382,7 @@ wmt14_reader = paddle.batch(
         out += paddle.layer.full_matrix_projection(input=gru_step)
     return out
     ```
+
 4. Differences in invoking the decoder between training and generation modes.

    4.1 Define the name of the decoder framework and the first two inputs of the `gru_decoder_with_attention` function. Note: these two inputs use `StaticInput`; see the [StaticInput documentation](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for details.
@@ -442,6 +443,7 @@ for param in parameters.keys():
 ```

 ### Train the model
+
 1. Construct the trainer

    The trainer is built from the optimization target (cost), the network topology, and the model parameters; an optimization method must also be specified at construction time, and here the most basic SGD is used.

@@ -451,7 +453,7 @@ for param in parameters.keys():
     trainer = paddle.trainer.SGD(cost=cost,
                                  parameters=parameters,
                                  update_equation=optimizer)
-```
+    ```

 2. Construct the event_handler

@@ -463,6 +465,7 @@ for param in parameters.keys():
             print "Pass %d, Batch %d, Cost %f, %s" % (
                 event.pass_id, event.batch_id, event.cost, event.metrics)
     ```
+
 3. Start training:

    ```python
@@ -477,7 +480,7 @@ for param in parameters.keys():
    Pass 0, Batch 0, Cost 247.408008, {'classification_error_evaluator': 1.0}
    Pass 0, Batch 10, Cost 212.058789, {'classification_error_evaluator': 0.8737863898277283}
    ...
-```
+    ```

 ## Apply the model
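
The machine_translation hunks above only reflow the trainer, event_handler, and launch steps, so it may help to see how the pieces fit together. A minimal sketch of that wiring, assuming `cost`, `parameters`, `optimizer`, and `wmt14_reader` are defined as in the tutorial; `paddle.event.EndIteration` is the v2 event type assumed here, and `num_passes=2` is an arbitrary illustrative value:

```python
import paddle.v2 as paddle

# Build the trainer from the optimization target, parameters, and optimizer,
# as the "Construct the trainer" item describes.
trainer = paddle.trainer.SGD(cost=cost,
                             parameters=parameters,
                             update_equation=optimizer)

def event_handler(event):
    # Mirror the sample log above: report cost and metrics every 10 batches.
    if isinstance(event, paddle.event.EndIteration):
        if event.batch_id % 10 == 0:
            print "Pass %d, Batch %d, Cost %f, %s" % (
                event.pass_id, event.batch_id, event.cost, event.metrics)

# Launch training over the batched WMT-14 reader.
trainer.train(reader=wmt14_reader,
              event_handler=event_handler,
              num_passes=2)
```

The event_handler is just a callback; any side effect (logging, checkpointing) can be hung off the same `EndIteration` hook.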