diff --git a/08.machine_translation/README.cn.md b/08.machine_translation/README.cn.md
index 266bd6cacfe560524f0d2e0b0e7dd26649e32a6b..ac9b932464227fc277e65544ee90da770afe1b86 100644
--- a/08.machine_translation/README.cn.md
+++ b/08.machine_translation/README.cn.md
@@ -228,19 +228,19 @@ is_generating = False
    - 对源语言序列编码后的结果(见2的最后一步),过一个前馈神经网络(Feed Forward Neural Network),得到其映射。

     ```python
-    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
-        encoded_proj += paddle.layer.full_matrix_projection(
-            input=encoded_vector)
+    encoded_proj = paddle.layer.mixed(
+        size=decoder_size,
+        input=paddle.layer.full_matrix_projection(encoded_vector))
     ```

    - 构造解码器RNN的初始状态。由于解码器需要预测时序目标序列,但在0时刻并没有初始值,所以我们希望对其进行初始化。这里采用的是将源语言序列逆序编码后的最后一个状态进行非线性映射,作为该初始值,即$c_0=h_T$。

     ```python
-    backward_first = paddle.layer.first_seq(input=src_backward)
-    with paddle.layer.mixed(
-        size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
-        decoder_boot += paddle.layer.full_matrix_projection(
-            input=backward_first)
+    backward_first = paddle.layer.first_seq(input=src_backward)
+    decoder_boot = paddle.layer.mixed(
+        size=decoder_size,
+        act=paddle.activation.Tanh(),
+        input=paddle.layer.full_matrix_projection(backward_first))
     ```

    - 定义解码阶段每一个时间步的RNN行为,即根据当前时刻的源语言上下文向量$c_i$、解码器隐层状态$z_i$和目标语言中第$i$个词$u_i$,来预测第$i+1$个词的概率$p_{i+1}$。
@@ -251,8 +251,7 @@ is_generating = False
    - 最后,使用softmax归一化计算单词的概率,将out结果返回,即实现公式$p\left ( u_i|u_{<i},\mathbf{x} \right )=softmax(W_sz_i+b_z)$。

     ```python
-    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-
+    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
         decoder_mem = paddle.layer.memory(
             name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
@@ -261,10 +260,12 @@ is_generating = False
             encoded_proj=enc_proj,
             decoder_state=decoder_mem)

-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
+        decoder_inputs = paddle.layer.mixed(
+            size=decoder_size * 3,
+            input=[
+                paddle.layer.full_matrix_projection(input=context),
+                paddle.layer.full_matrix_projection(input=current_word)
+            ])

         gru_step = paddle.layer.gru_step(
             name='gru_decoder',
@@ -272,20 +273,20 @@ is_generating = False
             output_mem=decoder_mem,
             size=decoder_size)

-        with paddle.layer.mixed(
-            size=target_dict_dim,
-            bias_attr=True,
-            act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
+        out = paddle.layer.mixed(
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax(),
+            input=paddle.layer.full_matrix_projection(input=gru_step))
         return out
-    ```
+    ```

 4. 定义解码器框架名字,和`gru_decoder_with_attention`函数的前两个输入。注意:这两个输入使用`StaticInput`,具体说明可见[StaticInput文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入)。

    ```python
    decoder_group_name = "decoder_group"
-   group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
-   group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
+   group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+   group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
    group_inputs = [group_input1, group_input2]
    ```

@@ -338,7 +339,7 @@ is_generating = False
         # GeneratedInputs, which is initialized by a start mark, such as <s>,
         # and must be included in generation.
-        trg_embedding = paddle.layer.GeneratedInputV2(
+        trg_embedding = paddle.layer.GeneratedInput(
             size=target_dict_dim,
             embedding_name='_target_language_embedding',
             embedding_size=word_vector_dim)
diff --git a/08.machine_translation/README.md b/08.machine_translation/README.md
index d6e79fadaeb86f88d7636e28f2a738e618048367..227492ac4a80ed67b2ec2236a6372cab5ba51e4b 100644
--- a/08.machine_translation/README.md
+++ b/08.machine_translation/README.md
@@ -265,19 +265,19 @@ is_generating = False
    - Get a projection of the encoding (cf. 2.3) of the source language sequence by passing it into a feed forward neural network

     ```python
-    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
-        encoded_proj += paddle.layer.full_matrix_projection(
-            input=encoded_vector)
+    encoded_proj = paddle.layer.mixed(
+        size=decoder_size,
+        input=paddle.layer.full_matrix_projection(encoded_vector))
     ```

    - Use a non-linear transformation of the last hidden state of the backward GRU on the source language sentence as the initial state of the decoder RNN $c_0=h_T$

     ```python
-    backward_first = paddle.layer.first_seq(input=src_backward)
-    with paddle.layer.mixed(
-        size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
-        decoder_boot += paddle.layer.full_matrix_projection(
-            input=backward_first)
+    backward_first = paddle.layer.first_seq(input=src_backward)
+    decoder_boot = paddle.layer.mixed(
+        size=decoder_size,
+        act=paddle.activation.Tanh(),
+        input=paddle.layer.full_matrix_projection(backward_first))
     ```

    - Define the computation in each time step for the decoder RNN, i.e., according to the current context vector $c_i$, hidden state for the decoder $z_i$ and the $i$-th word $u_i$ in the target language to predict the probability $p_{i+1}$ for the $i+1$-th word.
@@ -289,8 +289,7 @@ is_generating = False
    - Softmax normalization is used in the end to compute the probability of words, i.e., $p\left ( u_i|u_{<i},\mathbf{x} \right )=softmax(W_sz_i+b_z)$. The output is returned.

     ```python
-    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-
+    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
         decoder_mem = paddle.layer.memory(
             name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
@@ -299,10 +298,12 @@ is_generating = False
             encoded_proj=enc_proj,
             decoder_state=decoder_mem)

-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
+        decoder_inputs = paddle.layer.mixed(
+            size=decoder_size * 3,
+            input=[
+                paddle.layer.full_matrix_projection(input=context),
+                paddle.layer.full_matrix_projection(input=current_word)
+            ])

         gru_step = paddle.layer.gru_step(
             name='gru_decoder',
@@ -310,20 +311,20 @@ is_generating = False
             output_mem=decoder_mem,
             size=decoder_size)

-        with paddle.layer.mixed(
-            size=target_dict_dim,
-            bias_attr=True,
-            act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
+        out = paddle.layer.mixed(
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax(),
+            input=paddle.layer.full_matrix_projection(input=gru_step))
         return out
-    ```
+    ```

 4. Define the name for the decoder and the first two inputs of `gru_decoder_with_attention`. Note that `StaticInput` is used for the two inputs. Please refer to [StaticInput Document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for more details.
    ```python
    decoder_group_name = "decoder_group"
-   group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
-   group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
+   group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+   group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
    group_inputs = [group_input1, group_input2]
    ```

@@ -376,7 +377,7 @@ is_generating = False
         # GeneratedInputs, which is initialized by a start mark, such as <s>,
         # and must be included in generation.

-        trg_embedding = paddle.layer.GeneratedInputV2(
+        trg_embedding = paddle.layer.GeneratedInput(
             size=target_dict_dim,
             embedding_name='_target_language_embedding',
             embedding_size=word_vector_dim)
diff --git a/08.machine_translation/index.cn.html b/08.machine_translation/index.cn.html
index 9ac62e42f65fa0bd55d650680caeb25a08b8389d..43daf9b4181ae419f3265eba0cbcacea0fb8d6f9 100644
--- a/08.machine_translation/index.cn.html
+++ b/08.machine_translation/index.cn.html
@@ -270,19 +270,19 @@ is_generating = False
    - 对源语言序列编码后的结果(见2的最后一步),过一个前馈神经网络(Feed Forward Neural Network),得到其映射。

     ```python
-    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
-        encoded_proj += paddle.layer.full_matrix_projection(
-            input=encoded_vector)
+    encoded_proj = paddle.layer.mixed(
+        size=decoder_size,
+        input=paddle.layer.full_matrix_projection(encoded_vector))
     ```

    - 构造解码器RNN的初始状态。由于解码器需要预测时序目标序列,但在0时刻并没有初始值,所以我们希望对其进行初始化。这里采用的是将源语言序列逆序编码后的最后一个状态进行非线性映射,作为该初始值,即$c_0=h_T$。

     ```python
-    backward_first = paddle.layer.first_seq(input=src_backward)
-    with paddle.layer.mixed(
-        size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
-        decoder_boot += paddle.layer.full_matrix_projection(
-            input=backward_first)
+    backward_first = paddle.layer.first_seq(input=src_backward)
+    decoder_boot = paddle.layer.mixed(
+        size=decoder_size,
+        act=paddle.activation.Tanh(),
+        input=paddle.layer.full_matrix_projection(backward_first))
     ```

    - 定义解码阶段每一个时间步的RNN行为,即根据当前时刻的源语言上下文向量$c_i$、解码器隐层状态$z_i$和目标语言中第$i$个词$u_i$,来预测第$i+1$个词的概率$p_{i+1}$。
@@ -293,8 +293,7 @@ is_generating = False
    - 最后,使用softmax归一化计算单词的概率,将out结果返回,即实现公式$p\left ( u_i|u_{<i},\mathbf{x} \right )=softmax(W_sz_i+b_z)$。

     ```python
-    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-
+    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
         decoder_mem = paddle.layer.memory(
             name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
@@ -303,10 +302,12 @@ is_generating = False
             encoded_proj=enc_proj,
             decoder_state=decoder_mem)

-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
+        decoder_inputs = paddle.layer.mixed(
+            size=decoder_size * 3,
+            input=[
+                paddle.layer.full_matrix_projection(input=context),
+                paddle.layer.full_matrix_projection(input=current_word)
+            ])

         gru_step = paddle.layer.gru_step(
             name='gru_decoder',
@@ -314,20 +315,20 @@ is_generating = False
             output_mem=decoder_mem,
             size=decoder_size)

-        with paddle.layer.mixed(
-            size=target_dict_dim,
-            bias_attr=True,
-            act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
+        out = paddle.layer.mixed(
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax(),
+            input=paddle.layer.full_matrix_projection(input=gru_step))
         return out
-    ```
+    ```

 4. 定义解码器框架名字,和`gru_decoder_with_attention`函数的前两个输入。注意:这两个输入使用`StaticInput`,具体说明可见[StaticInput文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入)。

    ```python
    decoder_group_name = "decoder_group"
-   group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
-   group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
+   group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+   group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
    group_inputs = [group_input1, group_input2]
    ```

@@ -380,7 +381,7 @@ is_generating = False
         # GeneratedInputs, which is initialized by a start mark, such as <s>,
         # and must be included in generation.

-        trg_embedding = paddle.layer.GeneratedInputV2(
+        trg_embedding = paddle.layer.GeneratedInput(
             size=target_dict_dim,
             embedding_name='_target_language_embedding',
             embedding_size=word_vector_dim)
diff --git a/08.machine_translation/index.html b/08.machine_translation/index.html
index ecc7a6571160f04143020f22193964a5b5f9f7ac..5d58c9dd93e4cd85c918948da1540a7d3aaa2e84 100644
--- a/08.machine_translation/index.html
+++ b/08.machine_translation/index.html
@@ -307,19 +307,19 @@ is_generating = False
    - Get a projection of the encoding (cf. 2.3) of the source language sequence by passing it into a feed forward neural network

     ```python
-    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
-        encoded_proj += paddle.layer.full_matrix_projection(
-            input=encoded_vector)
+    encoded_proj = paddle.layer.mixed(
+        size=decoder_size,
+        input=paddle.layer.full_matrix_projection(encoded_vector))
     ```

    - Use a non-linear transformation of the last hidden state of the backward GRU on the source language sentence as the initial state of the decoder RNN $c_0=h_T$

     ```python
-    backward_first = paddle.layer.first_seq(input=src_backward)
-    with paddle.layer.mixed(
-        size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
-        decoder_boot += paddle.layer.full_matrix_projection(
-            input=backward_first)
+    backward_first = paddle.layer.first_seq(input=src_backward)
+    decoder_boot = paddle.layer.mixed(
+        size=decoder_size,
+        act=paddle.activation.Tanh(),
+        input=paddle.layer.full_matrix_projection(backward_first))
     ```

    - Define the computation in each time step for the decoder RNN, i.e., according to the current context vector $c_i$, hidden state for the decoder $z_i$ and the $i$-th word $u_i$ in the target language to predict the probability $p_{i+1}$ for the $i+1$-th word.
@@ -331,8 +331,7 @@ is_generating = False
    - Softmax normalization is used in the end to compute the probability of words, i.e., $p\left ( u_i|u_{<i},\mathbf{x} \right )=softmax(W_sz_i+b_z)$. The output is returned.
     ```python
-    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-
+    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
         decoder_mem = paddle.layer.memory(
             name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
@@ -341,10 +340,12 @@ is_generating = False
             encoded_proj=enc_proj,
             decoder_state=decoder_mem)

-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
+        decoder_inputs = paddle.layer.mixed(
+            size=decoder_size * 3,
+            input=[
+                paddle.layer.full_matrix_projection(input=context),
+                paddle.layer.full_matrix_projection(input=current_word)
+            ])

         gru_step = paddle.layer.gru_step(
             name='gru_decoder',
@@ -352,20 +353,20 @@ is_generating = False
             output_mem=decoder_mem,
             size=decoder_size)

-        with paddle.layer.mixed(
-            size=target_dict_dim,
-            bias_attr=True,
-            act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
+        out = paddle.layer.mixed(
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax(),
+            input=paddle.layer.full_matrix_projection(input=gru_step))
         return out
-    ```
+    ```

 4. Define the name for the decoder and the first two inputs of `gru_decoder_with_attention`. Note that `StaticInput` is used for the two inputs. Please refer to [StaticInput Document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for more details.

    ```python
    decoder_group_name = "decoder_group"
-   group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
-   group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
+   group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+   group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
    group_inputs = [group_input1, group_input2]
    ```

@@ -418,7 +419,7 @@ is_generating = False
         # GeneratedInputs, which is initialized by a start mark, such as <s>,
         # and must be included in generation.
-        trg_embedding = paddle.layer.GeneratedInputV2(
+        trg_embedding = paddle.layer.GeneratedInput(
             size=target_dict_dim,
             embedding_name='_target_language_embedding',
             embedding_size=word_vector_dim)
diff --git a/08.machine_translation/train.py b/08.machine_translation/train.py
index a2865b7b4f8390936d76eb0d9c6c43944908294e..a1394366f21ab55e81ab7494013a077ca77fbeef 100644
--- a/08.machine_translation/train.py
+++ b/08.machine_translation/train.py
@@ -27,16 +27,16 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
     encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])

     #### Decoder
-    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
-        encoded_proj += paddle.layer.full_matrix_projection(
-            input=encoded_vector)
+    encoded_proj = paddle.layer.mixed(
+        size=decoder_size,
+        input=paddle.layer.full_matrix_projection(encoded_vector))

     backward_first = paddle.layer.first_seq(input=src_backward)

-    with paddle.layer.mixed(
-        size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
-        decoder_boot += paddle.layer.full_matrix_projection(
-            input=backward_first)
+    decoder_boot = paddle.layer.mixed(
+        size=decoder_size,
+        act=paddle.activation.Tanh(),
+        input=paddle.layer.full_matrix_projection(backward_first))

     def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
@@ -48,10 +48,12 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
             encoded_proj=enc_proj,
             decoder_state=decoder_mem)

-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
+        decoder_inputs = paddle.layer.mixed(
+            size=decoder_size * 3,
+            input=[
+                paddle.layer.full_matrix_projection(input=context),
+                paddle.layer.full_matrix_projection(input=current_word)
+            ])

         gru_step = paddle.layer.gru_step(
             name='gru_decoder',
@@ -59,16 +61,16 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
             output_mem=decoder_mem,
             size=decoder_size)

-        with paddle.layer.mixed(
-            size=target_dict_dim,
-            bias_attr=True,
-            act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
+        out = paddle.layer.mixed(
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax(),
+            input=paddle.layer.full_matrix_projection(input=gru_step))
         return out

     decoder_group_name = "decoder_group"
-    group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
-    group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
     group_inputs = [group_input1, group_input2]

     if not is_generating:
@@ -106,7 +108,7 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
         # GeneratedInputs, which is initialized by a start mark, such as <s>,
         # and must be included in generation.

-        trg_embedding = paddle.layer.GeneratedInputV2(
+        trg_embedding = paddle.layer.GeneratedInput(
             size=target_dict_dim,
             embedding_name='_target_language_embedding',
             embedding_size=word_vector_dim)
@@ -178,7 +180,7 @@ def main():
         beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating)
         # get the pretrained model, whose bleu = 26.92
         parameters = paddle.dataset.wmt14.model()
-        # prob is the prediction probabilities, and id is the prediction word.
+        # prob is the prediction probabilities, and id is the prediction word.
         beam_result = paddle.infer(
             output_layer=beam_gen,
             parameters=parameters,
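
For readers unfamiliar with the two `paddle.layer.mixed` styles this patch converts between, here is a minimal, self-contained sketch of the functional style used on the `+` side. It assumes the legacy PaddlePaddle v2 API (`import paddle.v2 as paddle`, as in this chapter's `train.py`); the layer names (`src`, `trg`) and the sizes (1000, 32, 64) are illustrative placeholders, not part of this change.

```python
# A minimal sketch of the functional `paddle.layer.mixed` style, assuming the
# legacy PaddlePaddle v2 API. Names and sizes here are illustrative only.
import paddle.v2 as paddle

paddle.init(use_gpu=False, trainer_count=1)

# Two toy sequence inputs standing in for `context` and `current_word`.
src = paddle.layer.data(
    name='src', type=paddle.data_type.integer_value_sequence(1000))
trg = paddle.layer.data(
    name='trg', type=paddle.data_type.integer_value_sequence(1000))
src_emb = paddle.layer.embedding(input=src, size=32)
trg_emb = paddle.layer.embedding(input=trg, size=32)

# Old style (removed by this patch): accumulate projections in a `with` block.
#     with paddle.layer.mixed(size=64) as decoder_inputs:
#         decoder_inputs += paddle.layer.full_matrix_projection(input=src_emb)
#         decoder_inputs += paddle.layer.full_matrix_projection(input=trg_emb)
#
# New style: pass the projections directly via `input=`; the mixed layer
# sums them into a single 64-dimensional output.
decoder_inputs = paddle.layer.mixed(
    size=64,
    input=[
        paddle.layer.full_matrix_projection(input=src_emb),
        paddle.layer.full_matrix_projection(input=trg_emb)
    ])
```

Both forms build the same layer; the functional form only drops the context-manager accumulation, which is why every hunk above is a mechanical rewrite.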
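
The patch ends inside the `paddle.infer` call. As context for how such beam-search output is typically consumed, here is a hedged sketch of post-processing: it assumes, based on the surrounding chapter rather than this patch, that `paddle.infer` was called with `field=['prob', 'id']` so `beam_result[1]` is a flat list of word ids in which each candidate sequence is terminated by `-1`, and that `trg_dict` maps ids to target-language words. Toy stand-ins are used so the sketch runs on its own.

```python
# Toy stand-ins so the sketch is runnable standalone: a fake (prob, id) pair
# with two generated sequences (each terminated by -1) and a tiny id->word
# dict. The real values would come from `paddle.infer(..., field=['prob',
# 'id'])` and `paddle.dataset.wmt14.get_dict(...)` -- both assumptions here.
beam_result = ([0.9, 0.8], [2, 3, -1, 4, 2, -1])
trg_dict = {2: 'les', 3: 'chats', 4: 'des'}

seq_list, seq = [], []
for w in beam_result[1]:
    if w != -1:
        seq.append(w)
    else:
        # -1 closes one candidate translation; map ids to words and reset.
        seq_list.append(' '.join(trg_dict.get(i, '<unk>') for i in seq))
        seq = []

print(seq_list)  # ['les chats', 'des les']
```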