Commit 7d9dbc42 authored by xiaoting, committed by ceci3

Synchronize to develop (#845)

* -1->None (#823)

* unify paddle 1.6 api in understand_sentiment (#824)


* unify paddle 1.6 api in understand_sentiment

* Upgrade w2v & srl's api (#828)

* Upgrade w2v & srl's api

* Upgrade label semantic roles api

* Rewrite 08.machine_translation using Paddle-1.6 apis. (#826)

* Rewrite 08.machine_translation using Paddle-1.6 apis.

* Delete the old train.py in 08.machine_translation

* Update train.py to seq2seq.py in README_cn 08.machine_translation.

* Fix the print content of seq2seq.py.

* Update code format in README_cn of 08.machine_translation.

* add 1.6 requirement (#830)

* fix bugs in Readme.md of 04 and 07 (#831) (#832)

* Polish optimizer name in 04.word2vec (#841) (#842)

* change opt name in word2vec

* update index.cn.html

* Cherry pick for paddle 1.6 (#843)
Co-authored-by: ceci3 <ceci3@users.noreply.github.com>
Co-authored-by: Li Fuchen <lfchener@outlook.com>
Co-authored-by: Yibing Liu <liuyibing01@baidu.com>
Co-authored-by: Guo Sheng <whucsgs@163.com>
Co-authored-by: ruri <shipeng1108@163.com>
Co-authored-by: Chen Weihang <sunny_cwh@163.com>
Parent a0510ad9
......@@ -194,8 +194,8 @@ test_reader = paddle.batch(
训练程序的目的是定义一个训练模型的网络结构。对于线性回归来讲,它就是一个从输入到输出的简单的全连接层。更加复杂的结构,比如卷积神经网络,递归神经网络等会在随后的章节中介绍。训练程序必须返回`平均损失`作为第一个返回值,因为它会被后面反向传播算法所用到。
```python
x = fluid.layers.data(name='x', shape=[13], dtype='float32') # 定义输入的形状和数据类型
y = fluid.layers.data(name='y', shape=[1], dtype='float32') # 定义输出的形状和数据类型
x = fluid.data(name='x', shape=[None, 13], dtype='float32') # 定义输入的形状和数据类型
y = fluid.data(name='y', shape=[None, 1], dtype='float32') # 定义输出的形状和数据类型
y_predict = fluid.layers.fc(input=x, size=1, act=None) # 连接输入和输出的全连接层
main_program = fluid.default_main_program() # 获取默认/全局主函数
......
......@@ -196,8 +196,8 @@ test_reader = paddle.batch(
The aim of the training program is to define the network structure of the model to be trained. For linear regression, it is a simple fully connected layer from input to output. More complex structures, such as convolutional neural networks and recurrent neural networks, will be introduced in later chapters. The training program must return the `mean loss` as its first return value, because it is used by the backpropagation algorithm.
```python
x = fluid.layers.data(name='x', shape=[13], dtype='float32') # define shape and data type of input
y = fluid.layers.data(name='y', shape=[1], dtype='float32') # define shape and data type of output
x = fluid.data(name='x', shape=[None, 13], dtype='float32') # define shape and data type of input
y = fluid.data(name='y', shape=[None, 1], dtype='float32') # define shape and data type of output
y_predict = fluid.layers.fc(input=x, size=1, act=None) # fully connected layer connecting input and output
main_program = fluid.default_main_program() # get the default/global main program
......
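For readers migrating their own scripts, the following is a minimal sketch of how these `fluid.data` inputs (note the `None` batch dimension) plug into the rest of the regression network. The loss and optimizer shown are illustrative assumptions, not copied verbatim from the updated files.
```python
import paddle.fluid as fluid

# inputs with a variable batch dimension (None), per the 1.6 API
x = fluid.data(name='x', shape=[None, 13], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')

# a single fully connected layer maps 13 features to 1 predicted value
y_predict = fluid.layers.fc(input=x, size=1, act=None)

# squared error, averaged so backpropagation can use it as the first return value
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_loss = fluid.layers.mean(cost)

# a plain SGD optimizer, assumed here only for illustration
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_loss)
```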
......@@ -236,8 +236,8 @@ test_reader = paddle.batch(
训练程序的目的是定义一个训练模型的网络结构。对于线性回归来讲,它就是一个从输入到输出的简单的全连接层。更加复杂的结构,比如卷积神经网络,递归神经网络等会在随后的章节中介绍。训练程序必须返回`平均损失`作为第一个返回值,因为它会被后面反向传播算法所用到。
```python
x = fluid.layers.data(name='x', shape=[13], dtype='float32') # 定义输入的形状和数据类型
y = fluid.layers.data(name='y', shape=[1], dtype='float32') # 定义输出的形状和数据类型
x = fluid.data(name='x', shape=[None, 13], dtype='float32') # 定义输入的形状和数据类型
y = fluid.data(name='y', shape=[None, 1], dtype='float32') # 定义输出的形状和数据类型
y_predict = fluid.layers.fc(input=x, size=1, act=None) # 连接输入和输出的全连接层
main_program = fluid.default_main_program() # 获取默认/全局主函数
......
......@@ -238,8 +238,8 @@ test_reader = paddle.batch(
The aim of the training program is to define the network structure of the model to be trained. For linear regression, it is a simple fully connected layer from input to output. More complex structures, such as convolutional neural networks and recurrent neural networks, will be introduced in later chapters. The training program must return the `mean loss` as its first return value, because it is used by the backpropagation algorithm.
```python
x = fluid.layers.data(name='x', shape=[13], dtype='float32') # define shape and data type of input
y = fluid.layers.data(name='y', shape=[1], dtype='float32') # define shape and data type of output
x = fluid.data(name='x', shape=[None, 13], dtype='float32') # define shape and data type of input
y = fluid.data(name='y', shape=[None, 1], dtype='float32') # define shape and data type of output
y_predict = fluid.layers.fc(input=x, size=1, act=None) # fully connected layer connecting input and output
main_program = fluid.default_main_program() # get the default/global main program
......
......@@ -87,8 +87,8 @@ def main():
batch_size=batch_size)
# feature vector of length 13
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
x = fluid.data(name='x', shape=[None, 13], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
main_program = fluid.default_main_program()
startup_program = fluid.default_startup_program()
......
......@@ -209,7 +209,7 @@ def softmax_regression():
predict_image -- 分类的结果
"""
# 输入的原始图像数据,大小为28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# 以softmax为激活函数的全连接层,输出层的大小必须为数字的个数10
predict = fluid.layers.fc(
input=img, size=10, act='softmax')
......@@ -229,7 +229,7 @@ def multilayer_perceptron():
predict_image -- 分类的结果
"""
# 输入的原始图像数据,大小为28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# 第一个全连接层,激活函数为ReLU
hidden = fluid.layers.fc(input=img, size=200, act='relu')
# 第二个全连接层,激活函数为ReLU
......@@ -282,7 +282,7 @@ def convolutional_neural_network():
predict -- 分类的结果
"""
# 输入的原始图像数据,大小为28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# 第一个卷积-池化层
# 使用20个5*5的滤波器,池化大小为2,池化步长为2,激活函数为Relu
conv_pool_1 = conv_pool(
......@@ -327,7 +327,7 @@ def train_program():
"""
# 标签层,名称为label,对应输入图片的类别标签
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
# predict = softmax_regression() # 取消注释将使用 Softmax回归
# predict = multilayer_perceptron() # 取消注释将使用 多层感知器
......
......@@ -188,7 +188,7 @@ def softmax_regression():
predict_image -- result of classification
"""
# input original image data in size of 28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# A fully connected layer with softmax as the activation function; the output size must be 10, the number of digit classes
predict = fluid.layers.fc(
input=img, size=10, act='softmax')
......@@ -208,7 +208,7 @@ def multilayer_perceptron():
predict_image -- result of classification
"""
# input raw image data in size of 28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# the first fully connected layer, whose activation function is ReLU
hidden = fluid.layers.fc(input=img, size=200, act='relu')
# the second fully connected layer, whose activation function is ReLU
......@@ -260,7 +260,7 @@ def convolutional_neural_network():
predict -- result of classification
"""
# input raw image data in size of 28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# the first convolution-pooling layer
# Use 20 5*5 filters, the pooling size is 2, the pooling step is 2, and the activation function is Relu.
conv_pool_1 = conv_pool(
......@@ -305,7 +305,7 @@ def train_program():
"""
# label layer, named 'label', corresponding to the class label of the input image
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
# predict = softmax_regression() # uncomment to use softmax regression
# predict = multilayer_perceptron() # uncomment to use the multilayer perceptron
......
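As a compact illustration of the same migration pattern in this chapter, here is a hedged sketch that combines the softmax classifier with the loss and accuracy used by `train_program`; the helper name `simple_train_net` is ours, not from the repository.
```python
import paddle.fluid as fluid

def simple_train_net():
    # image input in NCHW layout with a variable batch dimension
    img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
    # one int64 class label per image
    label = fluid.data(name='label', shape=[None, 1], dtype='int64')
    # softmax regression: a single fully connected layer with 10 outputs
    predict = fluid.layers.fc(input=img, size=10, act='softmax')
    # cross-entropy loss and accuracy, as in the tutorial's train_program
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(cost)
    acc = fluid.layers.accuracy(input=predict, label=label)
    return predict, [avg_cost, acc]
```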
......@@ -251,7 +251,7 @@ def softmax_regression():
predict_image -- 分类的结果
"""
# 输入的原始图像数据,大小为28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# 以softmax为激活函数的全连接层,输出层的大小必须为数字的个数10
predict = fluid.layers.fc(
input=img, size=10, act='softmax')
......@@ -271,7 +271,7 @@ def multilayer_perceptron():
predict_image -- 分类的结果
"""
# 输入的原始图像数据,大小为28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# 第一个全连接层,激活函数为ReLU
hidden = fluid.layers.fc(input=img, size=200, act='relu')
# 第二个全连接层,激活函数为ReLU
......@@ -324,7 +324,7 @@ def convolutional_neural_network():
predict -- 分类的结果
"""
# 输入的原始图像数据,大小为28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# 第一个卷积-池化层
# 使用20个5*5的滤波器,池化大小为2,池化步长为2,激活函数为Relu
conv_pool_1 = conv_pool(
......@@ -369,7 +369,7 @@ def train_program():
"""
# 标签层,名称为label,对应输入图片的类别标签
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
# predict = softmax_regression() # 取消注释将使用 Softmax回归
# predict = multilayer_perceptron() # 取消注释将使用 多层感知器
......
......@@ -230,7 +230,7 @@ def softmax_regression():
predict_image -- result of classification
"""
# input original image data in size of 28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# A fully connected layer with softmax as the activation function; the output size must be 10, the number of digit classes
predict = fluid.layers.fc(
input=img, size=10, act='softmax')
......@@ -250,7 +250,7 @@ def multilayer_perceptron():
predict_image -- result of classification
"""
# input raw image data in size of 28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# the first fully connected layer, whose activation function is ReLU
hidden = fluid.layers.fc(input=img, size=200, act='relu')
# the second fully connected layer, whose activation function is ReLU
......@@ -302,7 +302,7 @@ def convolutional_neural_network():
predict -- result of classification
"""
# input raw image data in size of 28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# the first convolution-pooling layer
# Use 20 5*5 filters, the pooling size is 2, the pooling step is 2, and the activation function is Relu.
conv_pool_1 = conv_pool(
......@@ -347,7 +347,7 @@ def train_program():
"""
# label layer, named 'label', corresponding to the class label of the input image
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
# predict = softmax_regression() # uncomment to use softmax regression
# predict = multilayer_perceptron() # uncomment to use the multilayer perceptron
......
......@@ -101,8 +101,8 @@ def train(nn_type,
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
if nn_type == 'softmax_regression':
net_conf = softmax_regression
......
......@@ -14,6 +14,7 @@
3.文档和脚本中代码的一致性问题:
请注意:为使本文更加易读易用,我们拆分、调整了train.py的代码并放入本文。本文中代码与train.py的运行结果一致,可直接运行[train.py](https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/train.py)进行验证。
4. PaddlePaddle版本:PaddlePaddle 1.6及以上版本或适当的develop版本。
## 背景介绍
......
......@@ -15,6 +15,10 @@ With Deep learning, image classification can be framed as a supervised or unsupe
In this chapter, we introduce deep-learning-based image classification methods and explain how to train a CNN model using PaddlePaddle.
## Requirement
1. PaddlePaddle version 1.6 or higher, or suitable develop version.
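If you are unsure which version is installed, a quick check (illustrative only) is:
```python
import paddle

# the updated tutorials assume PaddlePaddle 1.6 or newer
print(paddle.__version__)
```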
## Result Demo
Image Classification can be divided into general image classification and fine-grained image classification.
......
......@@ -56,6 +56,7 @@
3.文档和脚本中代码的一致性问题:
请注意:为使本文更加易读易用,我们拆分、调整了train.py的代码并放入本文。本文中代码与train.py的运行结果一致,可直接运行[train.py](https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/train.py)进行验证。
4. PaddlePaddle版本:PaddlePaddle 1.6及以上版本或适当的develop版本。
## 背景介绍
......
......@@ -57,6 +57,10 @@ With Deep learning, image classification can be framed as a supervised or unsupe
In this chapter, we introduce deep-learning-based image classification methods and explain how to train a CNN model using PaddlePaddle.
## Requirement
1. PaddlePaddle version 1.6 or higher, or suitable develop version.
## Result Demo
Image Classification can be divided into general image classification and fine-grained image classification.
......
......@@ -262,32 +262,32 @@ dict_size = len(word_dict)
```
更大的`BATCH_SIZE`将使得训练更快收敛,但也会消耗更多内存。由于词向量计算规模较大,如果环境允许,请开启使用GPU进行训练,能更快得到结果。
不同于之前的PaddlePaddle v2版本,在新的Fluid版本里,我们不必再手动计算词向量。PaddlePaddle提供了一个内置的方法`fluid.layers.embedding`,我们就可以直接用它来构造 N-gram 神经网络。
不同于之前的PaddlePaddle v2版本,在新的Fluid版本里,我们不必再手动计算词向量。PaddlePaddle提供了一个内置的方法`fluid.embedding`,我们就可以直接用它来构造 N-gram 神经网络。
- 我们来定义我们的 N-gram 神经网络结构。这个结构在训练和预测中都会使用到。因为词向量比较稀疏,我们传入参数 `is_sparse == True`, 可以加速稀疏矩阵的更新。
```python
def inference_program(words, is_sparse):
embed_first = fluid.layers.embedding(
embed_first = fluid.embedding(
input=words[0],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_second = fluid.layers.embedding(
embed_second = fluid.embedding(
input=words[1],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_third = fluid.layers.embedding(
embed_third = fluid.embedding(
input=words[2],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_fourth = fluid.layers.embedding(
embed_fourth = fluid.embedding(
input=words[3],
size=[dict_size, EMBED_SIZE],
dtype='float32',
......@@ -310,7 +310,7 @@ def train_program(predict_word):
# 'next_word'的定义必须要在inference_program的声明之后,
# 否则train program输入数据的顺序就变成了[next_word, firstw, secondw,
# thirdw, fourthw], 这是不正确的.
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
avg_cost = fluid.layers.mean(cost)
return avg_cost
......@@ -335,11 +335,11 @@ def train(if_use_cuda, params_dirname, is_sparse=True):
test_reader = paddle.batch(
paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
forth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
first_word = fluid.data(name='firstw', shape=[None, 1], dtype='int64')
second_word = fluid.data(name='secondw', shape=[None, 1], dtype='int64')
third_word = fluid.data(name='thirdw', shape=[None, 1], dtype='int64')
forth_word = fluid.data(name='fourthw', shape=[None, 1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
word_list = [first_word, second_word, third_word, forth_word, next_word]
feed_order = ['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']
......
......@@ -227,32 +227,32 @@ dict_size = len(word_dict)
```
A larger `BATCH_SIZE` makes training converge faster, but it also consumes more memory. Since word-vector computation is large-scale, enable GPU training if your environment allows it, to get results faster.
Unlike the previous PaddlePaddle v2 version, in the new Fluid version, we don't have to manually calculate the word vector. PaddlePaddle provides a built-in method `fluid.layers.embedding`, which we can use directly to construct an N-gram neural network.
Unlike the previous PaddlePaddle v2 version, in the new Fluid version, we don't have to manually calculate the word vector. PaddlePaddle provides a built-in method `fluid.embedding`, which we can use directly to construct an N-gram neural network.
- Let's define our N-gram neural network structure. This structure is used in both training and predicting. Because the word vector is sparse, we pass the parameter `is_sparse == True` to speed up the update of the sparse matrix.
```python
def inference_program(words, is_sparse):
embed_first = fluid.layers.embedding(
embed_first = fluid.embedding(
input=words[0],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_second = fluid.layers.embedding(
embed_second = fluid.embedding(
input=words[1],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_third = fluid.layers.embedding(
embed_third = fluid.embedding(
input=words[2],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_fourth = fluid.layers.embedding(
embed_fourth = fluid.embedding(
input=words[3],
size=[dict_size, EMBED_SIZE],
dtype='float32',
......@@ -275,7 +275,7 @@ def train_program(predict_word):
# The definition of 'next_word' must come after the declaration of inference_program.
# Otherwise the input order of the train program becomes [next_word, firstw, secondw,
# thirdw, fourthw], which is incorrect.
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
avg_cost = fluid.layers.mean(cost)
return avg_cost
......@@ -300,11 +300,11 @@ def train(if_use_cuda, params_dirname, is_sparse=True):
test_reader = paddle.batch(
paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
forth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
first_word = fluid.data(name='firstw', shape=[None, 1], dtype='int64')
second_word = fluid.data(name='secondw', shape=[None, 1], dtype='int64')
third_word = fluid.data(name='thirdw', shape=[None, 1], dtype='int64')
forth_word = fluid.data(name='fourthw', shape=[None, 1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
word_list = [first_word, second_word, third_word, forth_word, next_word]
feed_order = ['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']
......
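The diff truncates `inference_program` after the fourth embedding; as a hedged, self-contained sketch, an N-gram network of this kind is typically completed by concatenating the context embeddings, adding a hidden layer, and predicting the next word with softmax (the constants below are placeholders for the tutorial's own values):
```python
import paddle.fluid as fluid

EMBED_SIZE = 32       # placeholder sizes; the tutorial defines its own constants
HIDDEN_SIZE = 256
dict_size = 30000

def ngram_net(words, is_sparse=True):
    # one shared embedding table ('shared_w') is looked up for each context word
    embeddings = [
        fluid.embedding(
            input=w,
            size=[dict_size, EMBED_SIZE],
            dtype='float32',
            is_sparse=is_sparse,
            param_attr='shared_w') for w in words
    ]
    # concatenate the context embeddings, then a hidden layer and a softmax output
    concat_embed = fluid.layers.concat(input=embeddings, axis=1)
    hidden = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid')
    predict_word = fluid.layers.fc(input=hidden, size=dict_size, act='softmax')
    return predict_word
```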
......@@ -304,32 +304,32 @@ dict_size = len(word_dict)
```
更大的`BATCH_SIZE`将使得训练更快收敛,但也会消耗更多内存。由于词向量计算规模较大,如果环境允许,请开启使用GPU进行训练,能更快得到结果。
不同于之前的PaddlePaddle v2版本,在新的Fluid版本里,我们不必再手动计算词向量。PaddlePaddle提供了一个内置的方法`fluid.layers.embedding`,我们就可以直接用它来构造 N-gram 神经网络。
不同于之前的PaddlePaddle v2版本,在新的Fluid版本里,我们不必再手动计算词向量。PaddlePaddle提供了一个内置的方法`fluid.embedding`,我们就可以直接用它来构造 N-gram 神经网络。
- 我们来定义我们的 N-gram 神经网络结构。这个结构在训练和预测中都会使用到。因为词向量比较稀疏,我们传入参数 `is_sparse == True`, 可以加速稀疏矩阵的更新。
```python
def inference_program(words, is_sparse):
embed_first = fluid.layers.embedding(
embed_first = fluid.embedding(
input=words[0],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_second = fluid.layers.embedding(
embed_second = fluid.embedding(
input=words[1],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_third = fluid.layers.embedding(
embed_third = fluid.embedding(
input=words[2],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_fourth = fluid.layers.embedding(
embed_fourth = fluid.embedding(
input=words[3],
size=[dict_size, EMBED_SIZE],
dtype='float32',
......@@ -352,7 +352,7 @@ def train_program(predict_word):
# 'next_word'的定义必须要在inference_program的声明之后,
# 否则train program输入数据的顺序就变成了[next_word, firstw, secondw,
# thirdw, fourthw], 这是不正确的.
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
avg_cost = fluid.layers.mean(cost)
return avg_cost
......@@ -377,11 +377,11 @@ def train(if_use_cuda, params_dirname, is_sparse=True):
test_reader = paddle.batch(
paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
forth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
first_word = fluid.data(name='firstw', shape=[None, 1], dtype='int64')
second_word = fluid.data(name='secondw', shape=[None, 1], dtype='int64')
third_word = fluid.data(name='thirdw', shape=[None, 1], dtype='int64')
forth_word = fluid.data(name='fourthw', shape=[None, 1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
word_list = [first_word, second_word, third_word, forth_word, next_word]
feed_order = ['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']
......
......@@ -269,32 +269,32 @@ dict_size = len(word_dict)
```
A larger `BATCH_SIZE` makes training converge faster, but it also consumes more memory. Since word-vector computation is large-scale, enable GPU training if your environment allows it, to get results faster.
Unlike the previous PaddlePaddle v2 version, in the new Fluid version, we don't have to manually calculate the word vector. PaddlePaddle provides a built-in method `fluid.layers.embedding`, which we can use directly to construct an N-gram neural network.
Unlike the previous PaddlePaddle v2 version, in the new Fluid version, we don't have to manually calculate the word vector. PaddlePaddle provides a built-in method `fluid.embedding`, which we can use directly to construct an N-gram neural network.
- Let's define our N-gram neural network structure. This structure is used in both training and predicting. Because the word vector is sparse, we pass the parameter `is_sparse == True` to speed up the update of the sparse matrix.
```python
def inference_program(words, is_sparse):
embed_first = fluid.layers.embedding(
embed_first = fluid.embedding(
input=words[0],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_second = fluid.layers.embedding(
embed_second = fluid.embedding(
input=words[1],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_third = fluid.layers.embedding(
embed_third = fluid.embedding(
input=words[2],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_fourth = fluid.layers.embedding(
embed_fourth = fluid.embedding(
input=words[3],
size=[dict_size, EMBED_SIZE],
dtype='float32',
......@@ -317,7 +317,7 @@ def train_program(predict_word):
# The definition of 'next_word' must come after the declaration of inference_program.
# Otherwise the input order of the train program becomes [next_word, firstw, secondw,
# thirdw, fourthw], which is incorrect.
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
avg_cost = fluid.layers.mean(cost)
return avg_cost
......@@ -342,11 +342,11 @@ def train(if_use_cuda, params_dirname, is_sparse=True):
test_reader = paddle.batch(
paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
forth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
first_word = fluid.data(name='firstw', shape=[None, 1], dtype='int64')
second_word = fluid.data(name='secondw', shape=[None, 1], dtype='int64')
third_word = fluid.data(name='thirdw', shape=[None, 1], dtype='int64')
forth_word = fluid.data(name='fourthw', shape=[None, 1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
word_list = [first_word, second_word, third_word, forth_word, next_word]
feed_order = ['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']
......
......@@ -45,25 +45,25 @@ def parse_args():
def inference_program(words, is_sparse):
embed_first = fluid.layers.embedding(
embed_first = fluid.embedding(
input=words[0],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_second = fluid.layers.embedding(
embed_second = fluid.embedding(
input=words[1],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_third = fluid.layers.embedding(
embed_third = fluid.embedding(
input=words[2],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_fourth = fluid.layers.embedding(
embed_fourth = fluid.embedding(
input=words[3],
size=[dict_size, EMBED_SIZE],
dtype='float32',
......@@ -82,7 +82,7 @@ def train_program(predict_word):
# The declaration of 'next_word' must come after the invocation of inference_program,
# or the input data order of the train program would become [next_word, firstw, secondw,
# thirdw, fourthw], which is incorrect.
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
avg_cost = fluid.layers.mean(cost)
return avg_cost
......@@ -102,11 +102,11 @@ def train(if_use_cuda, params_dirname, is_sparse=True):
test_reader = paddle.batch(
paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
forth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
first_word = fluid.data(name='firstw', shape=[None, 1], dtype='int64')
second_word = fluid.data(name='secondw', shape=[None, 1], dtype='int64')
third_word = fluid.data(name='thirdw', shape=[None, 1], dtype='int64')
forth_word = fluid.data(name='fourthw', shape=[None, 1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
word_list = [first_word, second_word, third_word, forth_word, next_word]
feed_order = ['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']
......
......@@ -218,9 +218,8 @@ def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
```python
def inference_program(word_dict):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
data = fluid.data(
name="words", shape=[None], dtype="int64", lod_level=1)
dict_dim = len(word_dict)
net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
# net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM)
......@@ -235,7 +234,7 @@ def inference_program(word_dict):
```python
def train_program(prediction):
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
......
......@@ -207,9 +207,8 @@ Next we define the prediction program (`inference_program`). We use `convolution
```python
def inference_program(word_dict):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
data = fluid.data(
name="words", shape=[None], dtype="int64", lod_level=1)
dict_dim = len(word_dict)
net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
# net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM)
......@@ -224,7 +223,7 @@ During the testing, the classifier calculates the probability of each output. Th
```python
def train_program(prediction):
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
......
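Because `words` is now declared as a one-dimensional LoD tensor (`shape=[None]`, `lod_level=1`), its feed can be built explicitly with `fluid.create_lod_tensor`. The word ids and lengths below are made up, purely to illustrate the layout:
```python
import numpy as np
import paddle.fluid as fluid

place = fluid.CPUPlace()

# two reviews of lengths 3 and 2, flattened into one id array
word_ids = np.array([8, 42, 7, 13, 99], dtype='int64')
# one LoD level: the first sequence holds 3 ids, the second 2
words_tensor = fluid.create_lod_tensor(word_ids, [[3, 2]], place)

# labels are a dense [batch_size, 1] int64 array
labels = np.array([[0], [1]], dtype='int64')

feed = {"words": words_tensor, "label": labels}
# `feed` can then be passed to exe.run(...) together with the program
# built from inference_program/train_program above
```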
......@@ -260,9 +260,8 @@ def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
```python
def inference_program(word_dict):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
data = fluid.data(
name="words", shape=[None], dtype="int64", lod_level=1)
dict_dim = len(word_dict)
net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
# net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM)
......@@ -277,7 +276,7 @@ def inference_program(word_dict):
```python
def train_program(prediction):
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
......
......@@ -249,9 +249,8 @@ Next we define the prediction program (`inference_program`). We use `convolution
```python
def inference_program(word_dict):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
data = fluid.data(
name="words", shape=[None], dtype="int64", lod_level=1)
dict_dim = len(word_dict)
net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
# net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM)
......@@ -266,7 +265,7 @@ During the testing, the classifier calculates the probability of each output. Th
```python
def train_program(prediction):
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
......
......@@ -62,16 +62,14 @@ def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):
def inference_program(word_dict):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
dict_dim = len(word_dict)
data = fluid.data(name="words", shape=[None], dtype="int64", lod_level=1)
net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
return net
def train_program(prediction):
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
......
......@@ -54,16 +54,14 @@ def dynamic_rnn_lstm(data, input_dim, class_dim, emb_dim, lstm_size):
def inference_program(word_dict):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
data = fluid.data(name="words", shape=[None], dtype="int64", lod_level=1)
dict_dim = len(word_dict)
pred = dynamic_rnn_lstm(data, dict_dim, CLASS_DIM, EMB_DIM, LSTM_SIZE)
return pred
def train_program(prediction):
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
......
......@@ -69,9 +69,7 @@ def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
def inference_program(word_dict):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
data = fluid.data(name="words", shape=[None], dtype="int64", lod_level=1)
dict_dim = len(word_dict)
net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM,
STACKED_NUM)
......@@ -80,7 +78,7 @@ def inference_program(word_dict):
def train_program(prediction):
# prediction = inference_program(word_dict)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
......
......@@ -270,42 +270,42 @@ is_local = True
```python
# 句子序列
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
word = fluid.data(
name='word_data', shape=[None, 1], dtype='int64', lod_level=1)
# 谓词
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.data(
name='verb_data', shape=[None, 1], dtype='int64', lod_level=1)
# 谓词上下文5个特征
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.data(
name='ctx_n2_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_n1 = fluid.data(
name='ctx_n1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_0 = fluid.data(
name='ctx_0_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p1 = fluid.data(
name='ctx_p1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p2 = fluid.data(
name='ctx_p2_data', shape=[None, 1], dtype='int64', lod_level=1)
# 谓词上下区域标志
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.data(
name='mark_data', shape=[None, 1], dtype='int64', lod_level=1)
```
### 定义网络结构
首先预训练并定义模型输入层
```python
#预训练谓词和谓词上下区域标志
predicate_embedding = fluid.layers.embedding(
predicate_embedding = fluid.embedding(
input=predicate,
size=[pred_dict_len, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
mark_embedding = fluid.embedding(
input=mark,
size=[mark_dict_len, mark_dim],
dtype='float32',
......@@ -316,7 +316,7 @@ word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
# 因词向量是预训练好的,这里不再训练embedding表,
# 参数属性trainable设置成False阻止了embedding表在训练过程中被更新
emb_layers = [
fluid.layers.embedding(
fluid.embedding(
size=[word_dict_len, word_dim],
input=x,
param_attr=fluid.ParamAttr(
......@@ -374,8 +374,8 @@ feature_out = fluid.layers.sums(input=[
])
# 标注序列
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1)
target = fluid.data(
name='target', shape=[None, 1], dtype='int64', lod_level=1)
# 学习 CRF 的转移特征
crf_cost = fluid.layers.linear_chain_crf(
......
......@@ -252,42 +252,42 @@ Defines the format of the model input features, including the sentence sequence,
```python
# Sentence sequences
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
word = fluid.data(
name='word_data', shape=[None, 1], dtype='int64', lod_level=1)
# predicate
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.data(
name='verb_data', shape=[None, 1], dtype='int64', lod_level=1)
# predicate context's 5 features
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.data(
name='ctx_n2_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_n1 = fluid.data(
name='ctx_n1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_0 = fluid.data(
name='ctx_0_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p1 = fluid.data(
name='ctx_p1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p2 = fluid.data(
name='ctx_p2_data', shape=[None, 1], dtype='int64', lod_level=1)
# Predicate context area flag
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.data(
name='mark_data', shape=[None, 1], dtype='int64', lod_level=1)
```
### Defining the network structure
First pre-train and define the model input layer
```python
#pre-training predicate and predicate context area flags
predicate_embedding = fluid.layers.embedding(
predicate_embedding = fluid.embedding(
input=predicate,
size=[pred_dict_len, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
mark_embedding = fluid.embedding(
input=mark,
size=[mark_dict_len, mark_dim],
dtype='float32',
......@@ -298,7 +298,7 @@ word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
# Because the word vectors are pre-trained, the embedding table is not trained again here;
# setting the parameter attribute trainable to False prevents the embedding table from being updated during training
emb_layers = [
fluid.layers.embedding(
fluid.embedding(
size=[word_dict_len, word_dim],
input=x,
param_attr=fluid.ParamAttr(
......@@ -356,7 +356,7 @@ feature_out = fluid.layers.sums(input=[
])
# tag/label sequence
target = fluid.layers.data(
target = fluid.data(
name='target', shape=[None, 1], dtype='int64', lod_level=1)
# Learning CRF transfer features
......
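The list comprehension above is cut off by the diff; as a sketch of how it typically continues (reusing the `word_input`, `predicate_embedding`, `mark_embedding` and dimension variables already defined in this snippet), the eight feature embeddings are collected as follows:
```python
import paddle.fluid as fluid

# one frozen (trainable=False) pre-trained embedding per word-type feature
emb_layers = [
    fluid.embedding(
        size=[word_dict_len, word_dim],
        input=x,
        param_attr=fluid.ParamAttr(name=embedding_name, trainable=False))
    for x in word_input
]
# the predicate and mark embeddings defined earlier complete the 8 features
emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding)
```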
......@@ -312,42 +312,42 @@ is_local = True
```python
# 句子序列
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
word = fluid.data(
name='word_data', shape=[None, 1], dtype='int64', lod_level=1)
# 谓词
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.data(
name='verb_data', shape=[None, 1], dtype='int64', lod_level=1)
# 谓词上下文5个特征
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.data(
name='ctx_n2_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_n1 = fluid.data(
name='ctx_n1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_0 = fluid.data(
name='ctx_0_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p1 = fluid.data(
name='ctx_p1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p2 = fluid.data(
name='ctx_p2_data', shape=[None, 1], dtype='int64', lod_level=1)
# 谓词上下区域标志
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.data(
name='mark_data', shape=[None, 1], dtype='int64', lod_level=1)
```
### 定义网络结构
首先预训练并定义模型输入层
```python
#预训练谓词和谓词上下区域标志
predicate_embedding = fluid.layers.embedding(
predicate_embedding = fluid.embedding(
input=predicate,
size=[pred_dict_len, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
mark_embedding = fluid.embedding(
input=mark,
size=[mark_dict_len, mark_dim],
dtype='float32',
......@@ -358,7 +358,7 @@ word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
# 因词向量是预训练好的,这里不再训练embedding表,
# 参数属性trainable设置成False阻止了embedding表在训练过程中被更新
emb_layers = [
fluid.layers.embedding(
fluid.embedding(
size=[word_dict_len, word_dim],
input=x,
param_attr=fluid.ParamAttr(
......@@ -416,8 +416,8 @@ feature_out = fluid.layers.sums(input=[
])
# 标注序列
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1)
target = fluid.data(
name='target', shape=[None, 1], dtype='int64', lod_level=1)
# 学习 CRF 的转移特征
crf_cost = fluid.layers.linear_chain_crf(
......
......@@ -294,42 +294,42 @@ Defines the format of the model input features, including the sentence sequence,
```python
# Sentence sequences
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
word = fluid.data(
name='word_data', shape=[None, 1], dtype='int64', lod_level=1)
# predicate
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.data(
name='verb_data', shape=[None, 1], dtype='int64', lod_level=1)
# predicate context's 5 features
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.data(
name='ctx_n2_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_n1 = fluid.data(
name='ctx_n1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_0 = fluid.data(
name='ctx_0_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p1 = fluid.data(
name='ctx_p1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p2 = fluid.data(
name='ctx_p2_data', shape=[None, 1], dtype='int64', lod_level=1)
# Predicate context area flag
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.data(
name='mark_data', shape=[None, 1], dtype='int64', lod_level=1)
```
### Defining the network structure
First pre-train and define the model input layer
```python
#pre-training predicate and predicate context area flags
predicate_embedding = fluid.layers.embedding(
predicate_embedding = fluid.embedding(
input=predicate,
size=[pred_dict_len, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
mark_embedding = fluid.embedding(
input=mark,
size=[mark_dict_len, mark_dim],
dtype='float32',
......@@ -340,7 +340,7 @@ word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
# Because the word vectors are pre-trained, the embedding table is not trained again here;
# setting the parameter attribute trainable to False prevents the embedding table from being updated during training
emb_layers = [
fluid.layers.embedding(
fluid.embedding(
size=[word_dict_len, word_dim],
input=x,
param_attr=fluid.ParamAttr(
......@@ -398,7 +398,7 @@ feature_out = fluid.layers.sums(input=[
])
# tag/label sequence
target = fluid.layers.data(
target = fluid.data(
name='target', shape=[None, 1], dtype='int64', lod_level=1)
# Learning CRF transfer features
......
......@@ -53,14 +53,14 @@ def load_parameter(file_name, h, w):
def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
**ignored):
# 8 features
predicate_embedding = fluid.layers.embedding(
predicate_embedding = fluid.embedding(
input=predicate,
size=[pred_dict_len, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
mark_embedding = fluid.embedding(
input=mark,
size=[mark_dict_len, mark_dim],
dtype='float32',
......@@ -68,7 +68,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
emb_layers = [
fluid.layers.embedding(
fluid.embedding(
size=[word_dict_len, word_dim],
input=x,
param_attr=fluid.ParamAttr(name=embedding_name, trainable=False))
......@@ -120,22 +120,22 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
def train(use_cuda, save_dirname=None, is_local=True):
# define data layers
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
word = fluid.data(
name='word_data', shape=[None, 1], dtype='int64', lod_level=1)
predicate = fluid.data(
name='verb_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_n2 = fluid.data(
name='ctx_n2_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_n1 = fluid.data(
name='ctx_n1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_0 = fluid.data(
name='ctx_0_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p1 = fluid.data(
name='ctx_p1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p2 = fluid.data(
name='ctx_p2_data', shape=[None, 1], dtype='int64', lod_level=1)
mark = fluid.data(
name='mark_data', shape=[None, 1], dtype='int64', lod_level=1)
if args.enable_ce:
fluid.default_startup_program().random_seed = 90
......
......@@ -5,7 +5,7 @@
### 说明
1. 硬件要求 本文可支持在CPU、GPU下运行
2. 对docker file cuda/cudnn的支持 如果您使用了本文配套的docker镜像,请注意:该镜像对GPU的支持仅限于CUDA 8,cuDNN 5
3. 文档中代码和train.py不一致的问题 请注意:为使本文更加易读易用,我们拆分、调整了train.py的代码并放入本文。本文中代码与train.py的运行结果一致,如希望直接看到训练脚本输出效果,可运行[train.py](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py)
3. 文档中代码和seq2seq.py不一致的问题 请注意:为使本文更加易读易用,我们拆分、调整了seq2seq.py的代码并放入本文。本文中代码与seq2seq.py的运行结果一致,如希望直接看到训练脚本输出效果,可运行[seq2seq.py](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/seq2seq.py)
## 背景介绍
......@@ -197,10 +197,14 @@ from __future__ import print_function
import os
import six
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
dict_size = 30000 # 词典大小
bos_id = 0 # 词典中start token对应的id
eos_id = 1 # 词典中end token对应的id
source_dict_size = target_dict_size = dict_size # 源/目标语言字典大小
word_dim = 512 # 词向量维度
hidden_dim = 512 # 编码器中的隐层大小
......@@ -209,123 +213,226 @@ max_length = 256 # 解码生成句子的最大长度
beam_size = 4 # beam search的柱宽度
batch_size = 64 # batch 中的样本数
is_sparse = True
model_save_dir = "machine_translation.inference.model"
```
然后如下实现编码器框架
接着定义所需要的数据输入
```python
def encoder():
# 定义源语言id序列的输入数据
src_word_id = fluid.layers.data(
name="src_word_id", shape=[1], dtype='int64', lod_level=1)
# 将上述编码映射到低维语言空间的词向量
src_embedding = fluid.layers.embedding(
input=src_word_id,
size=[source_dict_size, word_dim],
dtype='float32',
is_sparse=is_sparse)
# 用双向GRU编码源语言序列,拼接两个GRU的编码结果得到h
fc_forward = fluid.layers.fc(
input=src_embedding, size=hidden_dim * 3, bias_attr=False)
src_forward = fluid.layers.dynamic_gru(input=fc_forward, size=hidden_dim)
fc_backward = fluid.layers.fc(
input=src_embedding, size=hidden_dim * 3, bias_attr=False)
src_backward = fluid.layers.dynamic_gru(
input=fc_backward, size=hidden_dim, is_reverse=True)
encoded_vector = fluid.layers.concat(
input=[src_forward, src_backward], axis=1)
return encoded_vector
def data_func(is_train=True):
# 源语言source数据
src = fluid.data(name="src", shape=[None, None], dtype="int64")
src_sequence_length = fluid.data(name="src_sequence_length",
shape=[None],
dtype="int64")
inputs = [src, src_sequence_length]
# 训练时还需要目标语言target和label数据
if is_train:
trg = fluid.data(name="trg", shape=[None, None], dtype="int64")
trg_sequence_length = fluid.data(name="trg_sequence_length",
shape=[None],
dtype="int64")
label = fluid.data(name="label", shape=[None, None], dtype="int64")
inputs += [trg, trg_sequence_length, label]
# data loader
loader = fluid.io.DataLoader.from_generator(feed_list=inputs,
capacity=10,
iterable=True,
use_double_buffer=True)
return inputs, loader
```
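The loader returned by `data_func` still needs a data source before anything can run. Below is a hedged sketch of wiring it up; the padded-batch generator is a stand-in for the tutorial's real WMT16 reader, `use_gpu` is a local flag rather than a variable from seq2seq.py, and `dict_size`/`batch_size` come from the hyperparameter block above.
```python
import numpy as np
import paddle.fluid as fluid

use_gpu = False

def fake_batch_generator():
    # yields (src, src_sequence_length) batches of padded int64 ids;
    # a real reader would be built from paddle.dataset.wmt16 with padding applied
    for _ in range(10):
        src = np.random.randint(0, dict_size, size=(batch_size, 20)).astype("int64")
        src_len = np.full([batch_size], 20, dtype="int64")
        yield src, src_len

inputs, loader = data_func(is_train=False)
places = fluid.cuda_places() if use_gpu else fluid.cpu_places()
loader.set_batch_generator(fake_batch_generator, places=places)

for data in loader():
    # each `data` item can be fed directly to exe.run(program, feed=data, ...)
    pass
```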
然后如下实现使用双向GRU的编码器:
```python
def encoder(src_embedding, src_sequence_length):
# 使用GRUCell构建前向RNN
encoder_fwd_cell = layers.GRUCell(hidden_size=hidden_dim)
encoder_fwd_output, fwd_state = layers.rnn(
cell=encoder_fwd_cell,
inputs=src_embedding,
sequence_length=src_sequence_length,
time_major=False,
is_reverse=False)
# 使用GRUCell构建反向RNN
encoder_bwd_cell = layers.GRUCell(hidden_size=hidden_dim)
encoder_bwd_output, bwd_state = layers.rnn(
cell=encoder_bwd_cell,
inputs=src_embedding,
sequence_length=src_sequence_length,
time_major=False,
is_reverse=True)
# 拼接前向与反向GRU的编码结果得到h
encoder_output = layers.concat(
input=[encoder_fwd_output, encoder_bwd_output], axis=2)
encoder_state = layers.concat(input=[fwd_state, bwd_state], axis=1)
return encoder_output, encoder_state
```
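A quick, illustrative way to sanity-check the encoder's output sizes is sketched below; the `_demo` input names are ours and are not part of seq2seq.py.
```python
import paddle.fluid as fluid

# illustrative shape check for the bidirectional encoder
src = fluid.data(name="src_demo", shape=[None, None], dtype="int64")
src_len = fluid.data(name="src_len_demo", shape=[None], dtype="int64")
src_emb = fluid.embedding(src, size=[source_dict_size, word_dim], dtype="float32")

enc_out, enc_state = encoder(src_emb, src_len)
# the forward and backward GRU outputs are concatenated, so enc_out has
# 2 * hidden_dim (1024 here) features; enc_state has the same width
print(enc_out.shape, enc_state.shape)
```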
再实现基于注意力机制的解码器:
- 首先定义解码器中单步的计算,即$z_{i+1}=\phi _{\theta '}\left ( c_i,u_i,z_i \right )$,如下:
- 首先通过 Cell 定义解码器中单步的计算,即$z_{i+1}=\phi _{\theta '}\left ( c_i,u_i,z_i \right )$,这里使用 GRU 并加上注意力机制(Additive Attention),代码如下:
```python
# 定义RNN中的单步计算
def cell(x, hidden, encoder_out, encoder_out_proj):
class DecoderCell(layers.RNNCell):
def __init__(self, hidden_size):
self.hidden_size = hidden_size
self.gru_cell = layers.GRUCell(hidden_size)
def attention(self, hidden, encoder_output, encoder_output_proj,
encoder_padding_mask):
# 定义attention用以计算context,即 c_i,这里使用Bahdanau attention机制
def simple_attention(encoder_vec, encoder_proj, decoder_state):
decoder_state_proj = fluid.layers.fc(
input=decoder_state, size=decoder_size, bias_attr=False)
# sequence_expand将单步内容扩展为与encoder输出相同的序列
decoder_state_expand = fluid.layers.sequence_expand(
x=decoder_state_proj, y=encoder_proj)
mixed_state = fluid.layers.elementwise_add(encoder_proj,
decoder_state_expand)
attention_weights = fluid.layers.fc(
input=mixed_state, size=1, bias_attr=False)
attention_weights = fluid.layers.sequence_softmax(
input=attention_weights)
weigths_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
scaled = fluid.layers.elementwise_mul(
x=encoder_vec, y=weigths_reshape, axis=0)
context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
decoder_state_proj = layers.unsqueeze(
layers.fc(hidden, size=self.hidden_size, bias_attr=False), [1])
mixed_state = fluid.layers.elementwise_add(
encoder_output_proj,
layers.expand(decoder_state_proj,
[1, layers.shape(decoder_state_proj)[1], 1]))
attn_scores = layers.squeeze(
layers.fc(input=mixed_state,
size=1,
num_flatten_dims=2,
bias_attr=False), [2])
if encoder_padding_mask is not None:
attn_scores = layers.elementwise_add(attn_scores,
encoder_padding_mask)
attn_scores = layers.softmax(attn_scores)
context = layers.reduce_sum(layers.elementwise_mul(encoder_output,
attn_scores,
axis=0),
dim=1)
return context
context = simple_attention(encoder_out, encoder_out_proj, hidden)
out = fluid.layers.fc(
input=[x, context], size=decoder_size * 3, bias_attr=False)
out = fluid.layers.gru_unit(
input=out, hidden=hidden, size=decoder_size * 3)[0]
return out, out
def call(self,
step_input,
hidden,
encoder_output,
encoder_output_proj,
encoder_padding_mask=None):
# Bahdanau attention
context = self.attention(hidden, encoder_output, encoder_output_proj,
encoder_padding_mask)
step_input = layers.concat([step_input, context], axis=1)
# GRU
output, new_hidden = self.gru_cell(step_input, hidden)
return output, new_hidden
```
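The attention above is the additive (Bahdanau) form. As a plain NumPy illustration of what a single decoder step computes, independent of Paddle (shapes and values are made up; the classic formulation additionally applies a tanh before scoring):
```python
import numpy as np

batch, src_len, hidden = 2, 5, 8
encoder_output = np.random.rand(batch, src_len, hidden)       # h_j
encoder_output_proj = np.random.rand(batch, src_len, hidden)  # projected h_j
decoder_state_proj = np.random.rand(batch, 1, hidden)         # projected z_i
v = np.random.rand(hidden)                                    # the size-1 fc above

# score_ij from the summed projections, then a softmax over source positions
scores = (encoder_output_proj + decoder_state_proj) @ v       # [batch, src_len]
weights = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
# context c_i = sum_j a_ij * h_j
context = (weights[:, :, None] * encoder_output).sum(axis=1)  # [batch, hidden]
```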
- 基于定义的单步计算,使用`DynamicRNN`实现多步循环的训练模式下解码器,如下:
- 基于定义的单步计算,使用 `fluid.layers.rnn` 和 `fluid.layers.dynamic_decode` 分别实现用于训练和预测生成的多步循环解码器,如下:
```python
def train_decoder(encoder_out):
# 获取编码器输出的最后一步并进行非线性映射以构造解码器RNN的初始状态
encoder_last = fluid.layers.sequence_last_step(input=encoder_out)
encoder_last_proj = fluid.layers.fc(
input=encoder_last, size=decoder_size, act='tanh')
# 编码器输出在attention中计算结果的cache
encoder_out_proj = fluid.layers.fc(
input=encoder_out, size=decoder_size, bias_attr=False)
# 定义目标语言id序列的输入数据,并映射到低维语言空间的词向量
trg_language_word = fluid.layers.data(
name="target_language_word", shape=[1], dtype='int64', lod_level=1)
trg_embedding = fluid.layers.embedding(
input=trg_language_word,
size=[target_dict_size, word_dim],
dtype='float32',
is_sparse=is_sparse)
rnn = fluid.layers.DynamicRNN()
with rnn.block():
# 获取当前步目标语言输入的词向量
x = rnn.step_input(trg_embedding)
# 获取隐层状态
pre_state = rnn.memory(init=encoder_last_proj, need_reorder=True)
# 在DynamicRNN中需使用static_input获取encoder相关的内容
# 对decoder来说这些内容在每个时间步都是固定的
encoder_out = rnn.static_input(encoder_out)
encoder_out_proj = rnn.static_input(encoder_out_proj)
# 执行单步的计算单元
out, current_state = cell(x, pre_state, encoder_out, encoder_out_proj)
# 计算归一化的单词预测概率
prob = fluid.layers.fc(input=out, size=target_dict_size, act='softmax')
# 更新隐层状态
rnn.update_memory(pre_state, current_state)
# 输出预测概率
rnn.output(prob)
return rnn()
def decoder(encoder_output,
encoder_output_proj,
encoder_state,
encoder_padding_mask,
trg=None,
is_train=True):
# 定义 RNN 所需要的组件
decoder_cell = DecoderCell(hidden_size=decoder_size)
decoder_initial_states = layers.fc(encoder_state,
size=decoder_size,
act="tanh")
trg_embeder = lambda x: fluid.embedding(input=x,
size=[target_dict_size, hidden_dim],
dtype="float32",
param_attr=fluid.ParamAttr(
name="trg_emb_table"))
output_layer = lambda x: layers.fc(x,
size=target_dict_size,
num_flatten_dims=len(x.shape) - 1,
param_attr=fluid.ParamAttr(name=
"output_w"))
if is_train: # 训练
# 训练时使用 `layers.rnn` 构造由 `cell` 指定的循环神经网络
# 循环的每一步从 `inputs` 中切片产生输入,并执行 `cell.call`
decoder_output, _ = layers.rnn(
cell=decoder_cell,
inputs=trg_embeder(trg),
initial_states=decoder_initial_states,
time_major=False,
encoder_output=encoder_output,
encoder_output_proj=encoder_output_proj,
encoder_padding_mask=encoder_padding_mask)
decoder_output = output_layer(decoder_output)
else: # 基于 beam search 的预测生成
# beam search 时需要将用到的形为 `[batch_size, ...]` 的张量扩展为 `[batch_size* beam_size, ...]`
encoder_output = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_output, beam_size)
encoder_output_proj = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_output_proj, beam_size)
encoder_padding_mask = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_padding_mask, beam_size)
# BeamSearchDecoder 定义了单步解码的操作:`cell.call` + `beam_search_step`
beam_search_decoder = layers.BeamSearchDecoder(cell=decoder_cell,
start_token=bos_id,
end_token=eos_id,
beam_size=beam_size,
embedding_fn=trg_embeder,
output_fn=output_layer)
# 使用 layers.dynamic_decode 动态解码
# 重复执行 `decoder.step()` 直到其返回的表示完成状态的张量中的值全部为True或解码步骤达到 `max_step_num`
decoder_output, _ = layers.dynamic_decode(
decoder=beam_search_decoder,
inits=decoder_initial_states,
max_step_num=max_length,
output_time_major=False,
encoder_output=encoder_output,
encoder_output_proj=encoder_output_proj,
encoder_padding_mask=encoder_padding_mask)
return decoder_output
```
接着就可以使用编码器和解码器定义整个训练网络;为了进行训练还需要定义优化器,如下:
接着就可以使用编码器和解码器定义整个网络,如下:
```python
def model_func(inputs, is_train=True):
# 源语言输入
src = inputs[0]
src_sequence_length = inputs[1]
src_embeder = lambda x: fluid.embedding(
input=x,
size=[source_dict_size, hidden_dim],
dtype="float32",
param_attr=fluid.ParamAttr(name="src_emb_table"))
src_embedding = src_embeder(src)
# 编码器
encoder_output, encoder_state = encoder(src_embedding, src_sequence_length)
encoder_output_proj = layers.fc(input=encoder_output,
size=decoder_size,
num_flatten_dims=2,
bias_attr=False)
src_mask = layers.sequence_mask(src_sequence_length,
maxlen=layers.shape(src)[1],
dtype="float32")
encoder_padding_mask = (src_mask - 1.0) * 1e9
# 目标语言输入,训练时有、预测生成时无该输入
trg = inputs[2] if is_train else None
# 解码器
output = decoder(encoder_output=encoder_output,
encoder_output_proj=encoder_output_proj,
encoder_state=encoder_state,
encoder_padding_mask=encoder_padding_mask,
trg=trg,
is_train=is_train)
return output
```
为了进行训练还需要定义损失函数和优化器,如下:
```python
def train_model():
encoder_out = encoder()
rnn_out = train_decoder(encoder_out)
label = fluid.layers.data(
name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
# 定义损失函数
cost = fluid.layers.cross_entropy(input=rnn_out, label=label)
avg_cost = fluid.layers.mean(cost)
def loss_func(logits, label, trg_sequence_length):
probs = layers.softmax(logits)
# 使用交叉熵损失函数
loss = layers.cross_entropy(input=probs, label=label)
# 根据长度生成掩码,并依此剔除 padding 部分计算的损失
trg_mask = layers.sequence_mask(trg_sequence_length,
maxlen=layers.shape(logits)[1],
dtype="float32")
avg_cost = layers.reduce_sum(loss * trg_mask) / layers.reduce_sum(trg_mask)
return avg_cost
def optimizer_func():
......@@ -340,102 +447,45 @@ def optimizer_func():
regularization_coeff=1e-4))
```
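为直观理解 `loss_func` 中按掩码求平均的做法,这里给出一个 numpy 数值示意(损失值与掩码均为假设的示例数据):padding 位置既不计入分子,也不计入分母中的有效 token 数。
```python
import numpy as np

loss = np.array([[0.5, 0.8, 1.2],    # 每个位置的交叉熵损失(示例数据)
                 [0.4, 0.9, 0.7]])
trg_mask = np.array([[1., 1., 1.],   # 第二条样本只有 2 个有效 token,最后一位是 padding
                     [1., 1., 0.]])
avg_cost = (loss * trg_mask).sum() / trg_mask.sum()
print(avg_cost)  # (0.5 + 0.8 + 1.2 + 0.4 + 0.9) / 5 = 0.76
```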
以上是训练所需的模型构件,预测(生成)模式下基于beam search的解码器需要借助`while_op`实现,如下:
## 训练模型
```python
def infer_decoder(encoder_out):
# 获取编码器输出的最后一步并进行非线性映射以构造解码器RNN的初始状态
encoder_last = fluid.layers.sequence_last_step(input=encoder_out)
encoder_last_proj = fluid.layers.fc(
input=encoder_last, size=decoder_size, act='tanh')
# 编码器输出在attention中计算结果的cache
encoder_out_proj = fluid.layers.fc(
input=encoder_out, size=decoder_size, bias_attr=False)
# 最大解码步数
max_len = fluid.layers.fill_constant(
shape=[1], dtype='int64', value=max_length)
# 解码步数计数变量
counter = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True)
# 定义 tensor array 用以保存各个时间步的内容,并写入初始id,score和state
init_ids = fluid.layers.data(
name="init_ids", shape=[1], dtype="int64", lod_level=2)
init_scores = fluid.layers.data(
name="init_scores", shape=[1], dtype="float32", lod_level=2)
ids_array = fluid.layers.array_write(init_ids, i=counter)
scores_array = fluid.layers.array_write(init_scores, i=counter)
state_array = fluid.layers.array_write(encoder_last_proj, i=counter)
# 定义循环终止条件变量
cond = fluid.layers.less_than(x=counter, y=max_len)
while_op = fluid.layers.While(cond=cond)
with while_op.block():
# 获取解码器在当前步的输入,包括上一步选择的id,对应的score和上一步的state
pre_ids = fluid.layers.array_read(array=ids_array, i=counter)
pre_score = fluid.layers.array_read(array=scores_array, i=counter)
pre_state = fluid.layers.array_read(array=state_array, i=counter)
# 同train_decoder中的内容,进行RNN的单步计算
pre_ids_emb = fluid.layers.embedding(
input=pre_ids,
size=[target_dict_size, word_dim],
dtype='float32',
is_sparse=is_sparse)
out, current_state = cell(pre_ids_emb, pre_state, encoder_out,
encoder_out_proj)
prob = fluid.layers.fc(
input=current_state, size=target_dict_size, act='softmax')
# 计算累计得分,进行beam search
topk_scores, topk_indices = fluid.layers.topk(prob, k=beam_size)
accu_scores = fluid.layers.elementwise_add(
x=fluid.layers.log(topk_scores),
y=fluid.layers.reshape(pre_score, shape=[-1]),
axis=0)
accu_scores = fluid.layers.lod_reset(x=accu_scores, y=pre_ids)
selected_ids, selected_scores = fluid.layers.beam_search(
pre_ids, pre_score, topk_indices, accu_scores, beam_size, end_id=1)
fluid.layers.increment(x=counter, value=1, in_place=True)
# 将 search 结果写入 tensor array 中
fluid.layers.array_write(selected_ids, array=ids_array, i=counter)
fluid.layers.array_write(selected_scores, array=scores_array, i=counter)
# sequence_expand 作为 gather 使用以获取search结果对应的状态,并更新
current_state = fluid.layers.sequence_expand(current_state,
selected_ids)
fluid.layers.array_write(current_state, array=state_array, i=counter)
current_enc_out = fluid.layers.sequence_expand(encoder_out,
selected_ids)
fluid.layers.assign(current_enc_out, encoder_out)
current_enc_out_proj = fluid.layers.sequence_expand(
encoder_out_proj, selected_ids)
fluid.layers.assign(current_enc_out_proj, encoder_out_proj)
# 更新循环终止条件
length_cond = fluid.layers.less_than(x=counter, y=max_len)
finish_cond = fluid.layers.logical_not(
fluid.layers.is_empty(x=selected_ids))
fluid.layers.logical_and(x=length_cond, y=finish_cond, out=cond)
# 根据保存的每一步的结果,回溯生成最终解码结果
translation_ids, translation_scores = fluid.layers.beam_search_decode(
ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=1)
return translation_ids, translation_scores
```
### 定义数据生成器
使用编码器和预测模式的解码器,预测网络定义如下:
使用内置的`paddle.dataset.wmt16.train`接口定义数据生成器,其每次产生一条样本;经过 shuffle 和组 batch 后,再对 batch 内的样本进行 padding,作为训练的输入。同时定义预测使用的数据生成器,如下:
```python
def infer_model():
encoder_out = encoder()
translation_ids, translation_scores = infer_decoder(encoder_out)
return translation_ids, translation_scores
def inputs_generator(batch_size, pad_id, is_train=True):
data_generator = fluid.io.shuffle(
paddle.dataset.wmt16.train(source_dict_size, target_dict_size),
buf_size=10000) if is_train else paddle.dataset.wmt16.test(
source_dict_size, target_dict_size)
batch_generator = fluid.io.batch(data_generator, batch_size=batch_size)
# 对 batch 内的数据进行 padding
def _pad_batch_data(insts, pad_id):
seq_lengths = np.array(list(map(len, insts)), dtype="int64")
max_len = max(seq_lengths)
pad_data = np.array(
[inst + [pad_id] * (max_len - len(inst)) for inst in insts],
dtype="int64")
return pad_data, seq_lengths
def _generator():
for batch in batch_generator():
batch_src = [ins[0] for ins in batch]
src_data, src_lengths = _pad_batch_data(batch_src, pad_id)
inputs = [src_data, src_lengths]
if is_train: #训练时包含 target 和 label 数据
batch_trg = [ins[1] for ins in batch]
trg_data, trg_lengths = _pad_batch_data(batch_trg, pad_id)
batch_lbl = [ins[2] for ins in batch]
lbl_data, _ = _pad_batch_data(batch_lbl, pad_id)
inputs += [trg_data, trg_lengths, lbl_data]
yield inputs
return _generator
```
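下面是该生成器的一个简单使用示意(假设前文的 `inputs_generator`、`eos_id` 等均已定义),可用来快速检查各字段的形状是否符合预期:
```python
gen = inputs_generator(batch_size=4, pad_id=eos_id, is_train=True)
for batch in gen():
    src_data, src_lengths, trg_data, trg_lengths, lbl_data = batch
    # src_data/trg_data/lbl_data 的形状为 [batch_size, 本 batch 内最大长度]
    # src_lengths/trg_lengths 的形状为 [batch_size],记录各样本的真实长度
    print(src_data.shape, src_lengths, trg_data.shape, lbl_data.shape)
    break
```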
## 训练模型
### 构建训练程序
定义用于训练的`Program`,在其中创建训练的网络结构并添加优化器。同时还要定义用于初始化的`Program`,在创建训练网络的同时隐式地加入参数初始化的操作。
......@@ -445,40 +495,33 @@ train_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(train_prog, startup_prog):
with fluid.unique_name.guard():
avg_cost = train_model()
# 训练时:
# inputs = [src, src_sequence_length, trg, trg_sequence_length, label]
inputs, loader = data_func(is_train=True)
logits = model_func(inputs, is_train=True)
loss = loss_func(logits, inputs[-1], inputs[-2])
optimizer = optimizer_func()
optimizer.minimize(avg_cost)
optimizer.minimize(loss)
```
### 定义训练环境与执行器
### 定义训练环境
定义您的训练环境,可以指定训练是发生在CPU还是GPU上;并基于这个训练环境定义执行器。
定义您的训练环境,包括指定所用的设备、绑定训练使用的数据源和定义执行器。
```python
# 设置训练设备
use_cuda = False
# 定义使用设备和执行器
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
```
### 构建数据提供器
使用封装的`paddle.dataset.wmt16.train`接口定义数据生成器,其每次产生一条样本,shuffle和组完batch后作为训练的输入;另外还需要指明输入数据中各字段和`data_layer`定义的各输入的对应关系,这可以通过`DataFeeder`完成, 下面的feeder将产生数据的第一列映射到`src_word_id`这个输入。
```python
# 定义训练数据生成器
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt16.train(source_dict_size, target_dict_size),
buf_size=10000),
batch_size=batch_size)
# DataFeeder完成
feeder = fluid.DataFeeder(
feed_list=[
'src_word_id', 'target_language_word', 'target_language_next_word'
],
place=place,
program=train_prog)
places = fluid.cuda_places() if use_cuda else fluid.cpu_places()
# 设置数据源
loader.set_batch_generator(inputs_generator(batch_size,
eos_id,
is_train=True),
places=places)
# 定义执行器,初始化参数并绑定Program
exe = fluid.Executor(places[0])
exe.run(startup_prog)
prog = fluid.CompiledProgram(train_prog).with_data_parallel(
loss_name=loss.name)
```
### 训练主循环
......@@ -486,17 +529,13 @@ feeder = fluid.DataFeeder(
按训练轮数(EPOCH_NUM)进行训练循环,并且每轮结束后都保存训练好的参数。注意,开始循环训练前要先执行初始化的`Program`来初始化参数。另外,作为示例这里 EPOCH_NUM 设置较小,在该数据集上实际约需 20 个 epoch 才能收敛。
```python
# 执行初始化 Program,进行参数初始化
exe.run(startup_prog)
# 循环迭代执行训练
EPOCH_NUM = 2
for pass_id in six.moves.xrange(EPOCH_NUM):
batch_id = 0
for data in train_data():
cost = exe.run(
train_prog, feed=feeder.feed(data), fetch_list=[avg_cost])[0]
print('pass_id: %d, batch_id: %d, loss: %f' % (pass_id, batch_id,
cost))
for data in loader():
loss_val = exe.run(prog, feed=data, fetch_list=[loss])[0]
print('pass_id: %d, batch_id: %d, loss: %f' %
(pass_id, batch_id, loss_val))
batch_id += 1
# 保存模型
fluid.io.save_params(exe, model_save_dir, main_program=train_prog)
......@@ -513,81 +552,67 @@ infer_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(infer_prog, startup_prog):
with fluid.unique_name.guard():
translation_ids, translation_scores = infer_model()
inputs, loader = data_func(is_train=False)
predict_seqs = model_func(inputs, is_train=False)
```
### 构建数据提供器
### 定义预测环境
和训练类似,这里使用封装的`paddle.dataset.wmt16.test`接口定义测试数据生成器,测试数据共1000条,组完batch后作为预测的输入;另外我们获取源语言和目标语言id到word的词典,以将id序列转换为明文序列打印输出
定义您的预测环境,和训练类似,包括指定所用的设备、绑定预测使用的数据源和定义执行器。
```python
test_data = paddle.batch(
paddle.dataset.wmt16.test(source_dict_size, target_dict_size),
batch_size=batch_size)
src_idx2word = paddle.dataset.wmt16.get_dict(
"en", source_dict_size, reverse=True)
trg_idx2word = paddle.dataset.wmt16.get_dict(
"de", target_dict_size, reverse=True)
use_cuda = False
# 设置预测设备
places = fluid.cuda_places() if use_cuda else fluid.cpu_places()
# 设置数据源
loader.set_batch_generator(inputs_generator(batch_size,
eos_id,
is_train=False),
places=places)
# 定义执行器,加载参数并绑定Program
exe = fluid.Executor(places[0])
exe.run(startup_prog)
fluid.io.load_params(exe, model_save_dir, main_program=infer_prog)
prog = fluid.CompiledProgram(infer_prog).with_data_parallel()
```
### 测试
首先要加载训练过程保存下来的模型,然后就可以循环测试数据进行预测了。这里每次运行我们都会创建`data_layer`对应输入数据的`dict`传入,这个和`DataFeeder`相同的效果。生成过程对于每个测试数据都会将源语言句子和`beam_size`个生成句子打印输出。
循环测试数据进行预测,生成过程中会为每条测试数据打印源语言句子和`beam_size`个生成句子;为了以明文形式输出,还需要使用 id 到 word 映射的词典。如下:
```python
fluid.io.load_params(exe, model_save_dir, main_program=infer_prog)
for data in test_data():
src_word_id = fluid.create_lod_tensor(
data=[x[0] for x in data],
recursive_seq_lens=[[len(x[0]) for x in data]],
place=place)
# init_ids内容为start token
init_ids = fluid.create_lod_tensor(
data=np.array([[0]] * len(data), dtype='int64'),
recursive_seq_lens=[[1] * len(data)] * 2,
place=place)
# init_scores为beam search过程累积得分的初值
init_scores = fluid.create_lod_tensor(
data=np.array([[0.]] * len(data), dtype='float32'),
recursive_seq_lens=[[1] * len(data)] * 2,
place=place)
seq_ids, seq_scores = exe.run(
infer_prog,
feed={
'src_word_id': src_word_id,
'init_ids': init_ids,
'init_scores': init_scores
},
fetch_list=[translation_ids, translation_scores],
return_numpy=False)
# 如何解析翻译结果详见 train.py 中对应代码的注释说明
hyps = [[] for i in range(len(seq_ids.lod()[0]) - 1)]
scores = [[] for i in range(len(seq_scores.lod()[0]) - 1)]
for i in range(len(seq_ids.lod()[0]) - 1):
start = seq_ids.lod()[0][i]
end = seq_ids.lod()[0][i + 1]
# 获取 id 到 word 映射的词典
src_idx2word = paddle.dataset.wmt16.get_dict(
"en", source_dict_size, reverse=True)
trg_idx2word = paddle.dataset.wmt16.get_dict(
"de", target_dict_size, reverse=True)
# 循环测试数据
for data in loader():
seq_ids = exe.run(prog, feed=data, fetch_list=[predict_seqs])[0]
for ins_idx in range(seq_ids.shape[0]):
print("Original sentence:")
print(" ".join([src_idx2word[idx] for idx in data[i][0][1:-1]]))
print("Translated score and sentence:")
for j in range(end - start):
sub_start = seq_ids.lod()[1][start + j]
sub_end = seq_ids.lod()[1][start + j + 1]
hyps[i].append(" ".join([
trg_idx2word[idx]
for idx in np.array(seq_ids)[sub_start:sub_end][1:-1]
src_seqs = np.array(data[0]["src"])
print(" ".join([
src_idx2word[idx] for idx in src_seqs[ins_idx][1:]
if idx != eos_id
]))
scores[i].append(np.array(seq_scores)[sub_end - 1])
print(scores[i][-1], hyps[i][-1].encode('utf8'))
print("Translated sentence:")
for beam_idx in range(beam_size):
seq = [
trg_idx2word[idx] for idx in seq_ids[ins_idx, :, beam_idx]
if idx != eos_id
]
print(" ".join(seq).encode("utf8"))
```
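上面的写法会过滤掉序列中所有的 `eos_id`;另一种常见的后处理方式是在第一个 end token 处截断。下面给出一个可替换使用的小函数,仅作示意:
```python
def trim_after_eos(ids, eos_id):
    # 截断到第一个 end token(不含),若没有 end token 则原样返回
    ids = list(ids)
    return ids[:ids.index(eos_id)] if eos_id in ids else ids

# 用法示意:
# seq = [trg_idx2word[idx] for idx in trim_after_eos(seq_ids[ins_idx, :, beam_idx], eos_id)]
```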
可以观察到如下的预测结果输出:
```txt
Original sentence:
Two adults and two children sit on a park bench .
A man in an orange hat starring at something .
Translated score and sentence:
-2.5993705 Zwei Erwachsene und zwei Kinder sitzen auf einer Parkbank .
-2.6617606 Zwei Erwachsene und zwei Kinder spielen auf einer Parkbank .
-3.186554 Zwei Erwachsene und zwei Kinder sitzen auf einer Bank .
-3.4353821 Zwei Erwachsene und zwei Kinder spielen auf einer Bank .
Ein Mann mit einem orangen Schutzhelm starrt auf etwas .
Ein Mann mit einem gelben Schutzhelm starrt auf etwas .
Ein Mann mit einem gelben Schutzhelm starrt etwas an .
Ein Mann mit einem orangen Schutzhelm starrt etwas an .
```
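如果希望对生成质量做定量评估,可以借助第三方工具(如 nltk,需自行安装)计算 BLEU。下面是一个最小示意,其中的参考译文与生成结果取自上面的示例输出,仅用于演示接口用法:
```python
from nltk.translate.bleu_score import sentence_bleu

reference = ["Zwei Erwachsene und zwei Kinder sitzen auf einer Parkbank .".split()]
hypothesis = "Zwei Erwachsene und zwei Kinder spielen auf einer Parkbank .".split()
print(sentence_bleu(reference, hypothesis))  # 单句 BLEU,仅演示接口,非正式评估流程
```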
## 总结
......
......@@ -47,7 +47,7 @@
### 说明
1. 硬件要求:本文可支持在 CPU、GPU 下运行
2. 对 docker file cuda/cudnn 的支持:如果您使用了本文配套的 docker 镜像,请注意:该镜像对 GPU 的支持仅限于 CUDA 8,cuDNN 5
3. 文档中代码和train.py不一致的问题 请注意:为使本文更加易读易用,我们拆分、调整了train.py的代码并放入本文。本文中代码与train.py的运行结果一致,如希望直接看到训练脚本输出效果,可运行[train.py](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py)。
3. 文档中代码和seq2seq.py不一致的问题 请注意:为使本文更加易读易用,我们拆分、调整了seq2seq.py的代码并放入本文。本文中代码与seq2seq.py的运行结果一致,如希望直接看到训练脚本输出效果,可运行[seq2seq.py](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/seq2seq.py)。
## 背景介绍
......@@ -239,10 +239,14 @@ from __future__ import print_function
import os
import six
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
dict_size = 30000 # 词典大小
bos_id = 0 # 词典中start token对应的id
eos_id = 1 # 词典中end token对应的id
source_dict_size = target_dict_size = dict_size # 源/目标语言字典大小
word_dim = 512 # 词向量维度
hidden_dim = 512 # 编码器中的隐层大小
......@@ -251,123 +255,226 @@ max_length = 256 # 解码生成句子的最大长度
beam_size = 4 # beam search的柱宽度
batch_size = 64 # batch 中的样本数
is_sparse = True
model_save_dir = "machine_translation.inference.model"
```
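在正式组网前,可以先用数据集提供的 id 到 word 词典检查 `bos_id`、`eos_id` 的设置是否与数据约定一致(仅为检查示意,非必需步骤;首次运行会触发数据下载):
```python
check_idx2word = paddle.dataset.wmt16.get_dict("de", target_dict_size, reverse=True)
print(check_idx2word[bos_id], check_idx2word[eos_id])  # 预期分别为数据集的 start / end 标记
```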
然后如下实现编码器框架
接着定义所需要的数据输入
```python
def encoder():
# 定义源语言id序列的输入数据
src_word_id = fluid.layers.data(
name="src_word_id", shape=[1], dtype='int64', lod_level=1)
# 将上述编码映射到低维语言空间的词向量
src_embedding = fluid.layers.embedding(
input=src_word_id,
size=[source_dict_size, word_dim],
dtype='float32',
is_sparse=is_sparse)
# 用双向GRU编码源语言序列,拼接两个GRU的编码结果得到h
fc_forward = fluid.layers.fc(
input=src_embedding, size=hidden_dim * 3, bias_attr=False)
src_forward = fluid.layers.dynamic_gru(input=fc_forward, size=hidden_dim)
fc_backward = fluid.layers.fc(
input=src_embedding, size=hidden_dim * 3, bias_attr=False)
src_backward = fluid.layers.dynamic_gru(
input=fc_backward, size=hidden_dim, is_reverse=True)
encoded_vector = fluid.layers.concat(
input=[src_forward, src_backward], axis=1)
return encoded_vector
def data_func(is_train=True):
# 源语言source数据
src = fluid.data(name="src", shape=[None, None], dtype="int64")
src_sequence_length = fluid.data(name="src_sequence_length",
shape=[None],
dtype="int64")
inputs = [src, src_sequence_length]
# 训练时还需要目标语言target和label数据
if is_train:
trg = fluid.data(name="trg", shape=[None, None], dtype="int64")
trg_sequence_length = fluid.data(name="trg_sequence_length",
shape=[None],
dtype="int64")
label = fluid.data(name="label", shape=[None, None], dtype="int64")
inputs += [trg, trg_sequence_length, label]
# data loader
loader = fluid.io.DataLoader.from_generator(feed_list=inputs,
capacity=10,
iterable=True,
use_double_buffer=True)
return inputs, loader
```
然后如下实现使用双向GRU的编码器:
```python
def encoder(src_embedding, src_sequence_length):
# 使用GRUCell构建前向RNN
encoder_fwd_cell = layers.GRUCell(hidden_size=hidden_dim)
encoder_fwd_output, fwd_state = layers.rnn(
cell=encoder_fwd_cell,
inputs=src_embedding,
sequence_length=src_sequence_length,
time_major=False,
is_reverse=False)
# 使用GRUCell构建反向RNN
encoder_bwd_cell = layers.GRUCell(hidden_size=hidden_dim)
encoder_bwd_output, bwd_state = layers.rnn(
cell=encoder_bwd_cell,
inputs=src_embedding,
sequence_length=src_sequence_length,
time_major=False,
is_reverse=True)
# 拼接前向与反向GRU的编码结果得到h
encoder_output = layers.concat(
input=[encoder_fwd_output, encoder_bwd_output], axis=2)
encoder_state = layers.concat(input=[fwd_state, bwd_state], axis=1)
return encoder_output, encoder_state
```
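下面给出一个检查 encoder 输出形状的小程序示意(假设上面的 `encoder` 与各超参数均已定义;只用随机 id 跑一遍组网,不涉及训练):
```python
import numpy as np

check_prog = fluid.Program()
check_startup = fluid.Program()
with fluid.program_guard(check_prog, check_startup):
    src = fluid.data(name="src", shape=[None, None], dtype="int64")
    src_len = fluid.data(name="src_len", shape=[None], dtype="int64")
    src_emb = fluid.embedding(input=src,
                              size=[source_dict_size, hidden_dim],
                              dtype="float32")
    enc_out, enc_state = encoder(src_emb, src_len)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(check_startup)
out, state = exe.run(check_prog,
                     feed={"src": np.random.randint(0, dict_size, (2, 5)).astype("int64"),
                           "src_len": np.array([5, 3], dtype="int64")},
                     fetch_list=[enc_out, enc_state])
print(out.shape)    # 预期为 (2, 5, hidden_dim * 2)
print(state.shape)  # 预期为 (2, hidden_dim * 2)
```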
再实现基于注意力机制的解码器:
- 首先定义解码器中单步的计算,即$z_{i+1}=\phi _{\theta '}\left ( c_i,u_i,z_i \right )$,如下:
- 首先通过 Cell 定义解码器中单步的计算,即$z_{i+1}=\phi _{\theta '}\left ( c_i,u_i,z_i \right )$,这里使用 GRU 并加上注意力机制(Additive Attention),代码如下:
```python
# 定义RNN中的单步计算
def cell(x, hidden, encoder_out, encoder_out_proj):
class DecoderCell(layers.RNNCell):
def __init__(self, hidden_size):
self.hidden_size = hidden_size
self.gru_cell = layers.GRUCell(hidden_size)
def attention(self, hidden, encoder_output, encoder_output_proj,
encoder_padding_mask):
# 定义attention用以计算context,即 c_i,这里使用Bahdanau attention机制
def simple_attention(encoder_vec, encoder_proj, decoder_state):
decoder_state_proj = fluid.layers.fc(
input=decoder_state, size=decoder_size, bias_attr=False)
# sequence_expand将单步内容扩展为与encoder输出相同的序列
decoder_state_expand = fluid.layers.sequence_expand(
x=decoder_state_proj, y=encoder_proj)
mixed_state = fluid.layers.elementwise_add(encoder_proj,
decoder_state_expand)
attention_weights = fluid.layers.fc(
input=mixed_state, size=1, bias_attr=False)
attention_weights = fluid.layers.sequence_softmax(
input=attention_weights)
weigths_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
scaled = fluid.layers.elementwise_mul(
x=encoder_vec, y=weigths_reshape, axis=0)
context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
decoder_state_proj = layers.unsqueeze(
layers.fc(hidden, size=self.hidden_size, bias_attr=False), [1])
mixed_state = fluid.layers.elementwise_add(
encoder_output_proj,
layers.expand(decoder_state_proj,
[1, layers.shape(decoder_state_proj)[1], 1]))
attn_scores = layers.squeeze(
layers.fc(input=mixed_state,
size=1,
num_flatten_dims=2,
bias_attr=False), [2])
if encoder_padding_mask is not None:
attn_scores = layers.elementwise_add(attn_scores,
encoder_padding_mask)
attn_scores = layers.softmax(attn_scores)
context = layers.reduce_sum(layers.elementwise_mul(encoder_output,
attn_scores,
axis=0),
dim=1)
return context
context = simple_attention(encoder_out, encoder_out_proj, hidden)
out = fluid.layers.fc(
input=[x, context], size=decoder_size * 3, bias_attr=False)
out = fluid.layers.gru_unit(
input=out, hidden=hidden, size=decoder_size * 3)[0]
return out, out
def call(self,
step_input,
hidden,
encoder_output,
encoder_output_proj,
encoder_padding_mask=None):
# Bahdanau attention
context = self.attention(hidden, encoder_output, encoder_output_proj,
encoder_padding_mask)
step_input = layers.concat([step_input, context], axis=1)
# GRU
output, new_hidden = self.gru_cell(step_input, hidden)
return output, new_hidden
```
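为帮助理解 `attention` 中各步运算,下面用 numpy 复现其核心计算流程(省略 batch 维与 padding 掩码;`W_h` 对应 model_func 中计算 encoder_output_proj 的 fc,`W_s` 对应 decoder_state_proj 的 fc,`v` 对应 size=1 的打分 fc,这里均用随机数代替,仅作示意)。注意与经典 Bahdanau attention 不同,上面的实现没有对 mixed_state 施加 tanh,这里保持一致:
```python
import numpy as np

hidden_size, enc_len = 4, 5
encoder_output = np.random.rand(enc_len, hidden_size * 2)  # 双向编码结果,每个源端位置一个向量
hidden = np.random.rand(hidden_size)                       # 解码器上一步的隐状态
W_h = np.random.rand(hidden_size * 2, hidden_size)         # 对应 encoder_output_proj 的投影
W_s = np.random.rand(hidden_size, hidden_size)             # 对应 decoder_state_proj 的投影
v = np.random.rand(hidden_size)                            # 对应 size=1 的打分 fc

scores = (encoder_output @ W_h + hidden @ W_s) @ v         # 每个源端位置一个注意力打分
weights = np.exp(scores - scores.max())
weights = weights / weights.sum()                          # softmax
context = (weights[:, None] * encoder_output).sum(axis=0)  # 上下文向量 c_i
print(context.shape)                                       # (hidden_size * 2,)
```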
- 基于定义的单步计算,使用`DynamicRNN`实现多步循环的训练模式下解码器,如下:
- 基于定义的单步计算,使用 `fluid.layers.rnn` 和 `fluid.layers.dynamic_decode` 分别实现用于训练和预测生成的多步循环解码器,如下:
```python
def train_decoder(encoder_out):
# 获取编码器输出的最后一步并进行非线性映射以构造解码器RNN的初始状态
encoder_last = fluid.layers.sequence_last_step(input=encoder_out)
encoder_last_proj = fluid.layers.fc(
input=encoder_last, size=decoder_size, act='tanh')
# 编码器输出在attention中计算结果的cache
encoder_out_proj = fluid.layers.fc(
input=encoder_out, size=decoder_size, bias_attr=False)
# 定义目标语言id序列的输入数据,并映射到低维语言空间的词向量
trg_language_word = fluid.layers.data(
name="target_language_word", shape=[1], dtype='int64', lod_level=1)
trg_embedding = fluid.layers.embedding(
input=trg_language_word,
size=[target_dict_size, word_dim],
dtype='float32',
is_sparse=is_sparse)
rnn = fluid.layers.DynamicRNN()
with rnn.block():
# 获取当前步目标语言输入的词向量
x = rnn.step_input(trg_embedding)
# 获取隐层状态
pre_state = rnn.memory(init=encoder_last_proj, need_reorder=True)
# 在DynamicRNN中需使用static_input获取encoder相关的内容
# 对decoder来说这些内容在每个时间步都是固定的
encoder_out = rnn.static_input(encoder_out)
encoder_out_proj = rnn.static_input(encoder_out_proj)
# 执行单步的计算单元
out, current_state = cell(x, pre_state, encoder_out, encoder_out_proj)
# 计算归一化的单词预测概率
prob = fluid.layers.fc(input=out, size=target_dict_size, act='softmax')
# 更新隐层状态
rnn.update_memory(pre_state, current_state)
# 输出预测概率
rnn.output(prob)
return rnn()
def decoder(encoder_output,
encoder_output_proj,
encoder_state,
encoder_padding_mask,
trg=None,
is_train=True):
# 定义 RNN 所需要的组件
decoder_cell = DecoderCell(hidden_size=decoder_size)
decoder_initial_states = layers.fc(encoder_state,
size=decoder_size,
act="tanh")
trg_embeder = lambda x: fluid.embedding(input=x,
size=[target_dict_size, hidden_dim],
dtype="float32",
param_attr=fluid.ParamAttr(
name="trg_emb_table"))
output_layer = lambda x: layers.fc(x,
size=target_dict_size,
num_flatten_dims=len(x.shape) - 1,
param_attr=fluid.ParamAttr(name=
"output_w"))
if is_train: # 训练
# 训练时使用 `layers.rnn` 构造由 `cell` 指定的循环神经网络
# 循环的每一步从 `inputs` 中切片产生输入,并执行 `cell.call`
decoder_output, _ = layers.rnn(
cell=decoder_cell,
inputs=trg_embeder(trg),
initial_states=decoder_initial_states,
time_major=False,
encoder_output=encoder_output,
encoder_output_proj=encoder_output_proj,
encoder_padding_mask=encoder_padding_mask)
decoder_output = output_layer(decoder_output)
else: # 基于 beam search 的预测生成
# beam search 时需要将用到的形状为 `[batch_size, ...]` 的张量扩展为 `[batch_size * beam_size, ...]`
encoder_output = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_output, beam_size)
encoder_output_proj = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_output_proj, beam_size)
encoder_padding_mask = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_padding_mask, beam_size)
# BeamSearchDecoder 定义了单步解码的操作:`cell.call` + `beam_search_step`
beam_search_decoder = layers.BeamSearchDecoder(cell=decoder_cell,
start_token=bos_id,
end_token=eos_id,
beam_size=beam_size,
embedding_fn=trg_embeder,
output_fn=output_layer)
# 使用 layers.dynamic_decode 动态解码
# 重复执行 `decoder.step()`,直到其返回的表示完成状态的张量中的值全部为 True,或解码步数达到 `max_step_num`
decoder_output, _ = layers.dynamic_decode(
decoder=beam_search_decoder,
inits=decoder_initial_states,
max_step_num=max_length,
output_time_major=False,
encoder_output=encoder_output,
encoder_output_proj=encoder_output_proj,
encoder_padding_mask=encoder_padding_mask)
return decoder_output
```
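`BeamSearchDecoder` 在每个解码步都执行“扩展候选、累计打分、保留得分最高的 beam_size 条”的循环。下面用 numpy 实现一个与框架无关的玩具版 beam search,仅用于说明这一流程(其中的概率分布是随意构造的,与真实模型无关):
```python
import numpy as np

vocab_size, beam_size, max_step, eos = 5, 2, 4, 1

def next_log_probs(prefix):
    # 玩具“模型”:下一个词的分布只与前缀长度有关,仅用于演示
    rng = np.random.RandomState(len(prefix))
    return np.log(rng.dirichlet(np.ones(vocab_size)))

beams = [([0], 0.0)]  # (id 序列, 累计 log 概率),0 为 start token
for _ in range(max_step):
    candidates = []
    for seq, score in beams:
        if seq[-1] == eos:            # 已生成 end token 的 beam 不再扩展
            candidates.append((seq, score))
            continue
        log_p = next_log_probs(seq)
        for w in range(vocab_size):
            candidates.append((seq + [w], score + log_p[w]))
    # 只保留累计得分最高的 beam_size 条
    beams = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_size]

for seq, score in beams:
    print(score, seq)
```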
接着就可以使用编码器和解码器定义整个训练网络;为了进行训练还需要定义优化器,如下:
接着就可以使用编码器和解码器定义整个网络,如下:
```python
def model_func(inputs, is_train=True):
# 源语言输入
src = inputs[0]
src_sequence_length = inputs[1]
src_embeder = lambda x: fluid.embedding(
input=x,
size=[source_dict_size, hidden_dim],
dtype="float32",
param_attr=fluid.ParamAttr(name="src_emb_table"))
src_embedding = src_embeder(src)
# 编码器
encoder_output, encoder_state = encoder(src_embedding, src_sequence_length)
encoder_output_proj = layers.fc(input=encoder_output,
size=decoder_size,
num_flatten_dims=2,
bias_attr=False)
src_mask = layers.sequence_mask(src_sequence_length,
maxlen=layers.shape(src)[1],
dtype="float32")
encoder_padding_mask = (src_mask - 1.0) * 1e9
# 目标语言输入,训练时有、预测生成时无该输入
trg = inputs[2] if is_train else None
# 解码器
output = decoder(encoder_output=encoder_output,
encoder_output_proj=encoder_output_proj,
encoder_state=encoder_state,
encoder_padding_mask=encoder_padding_mask,
trg=trg,
is_train=is_train)
return output
```
为了进行训练还需要定义损失函数和优化器,如下:
```python
def train_model():
encoder_out = encoder()
rnn_out = train_decoder(encoder_out)
label = fluid.layers.data(
name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
# 定义损失函数
cost = fluid.layers.cross_entropy(input=rnn_out, label=label)
avg_cost = fluid.layers.mean(cost)
def loss_func(logits, label, trg_sequence_length):
probs = layers.softmax(logits)
# 使用交叉熵损失函数
loss = layers.cross_entropy(input=probs, label=label)
# 根据序列长度生成掩码,并据此剔除 padding 部分的损失
trg_mask = layers.sequence_mask(trg_sequence_length,
maxlen=layers.shape(logits)[1],
dtype="float32")
avg_cost = layers.reduce_sum(loss * trg_mask) / layers.reduce_sum(trg_mask)
return avg_cost
def optimizer_func():
......@@ -382,102 +489,45 @@ def optimizer_func():
regularization_coeff=1e-4))
```
以上是训练所需的模型构件,预测(生成)模式下基于beam search的解码器需要借助`while_op`实现,如下:
## 训练模型
```python
def infer_decoder(encoder_out):
# 获取编码器输出的最后一步并进行非线性映射以构造解码器RNN的初始状态
encoder_last = fluid.layers.sequence_last_step(input=encoder_out)
encoder_last_proj = fluid.layers.fc(
input=encoder_last, size=decoder_size, act='tanh')
# 编码器输出在attention中计算结果的cache
encoder_out_proj = fluid.layers.fc(
input=encoder_out, size=decoder_size, bias_attr=False)
# 最大解码步数
max_len = fluid.layers.fill_constant(
shape=[1], dtype='int64', value=max_length)
# 解码步数计数变量
counter = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True)
# 定义 tensor array 用以保存各个时间步的内容,并写入初始id,score和state
init_ids = fluid.layers.data(
name="init_ids", shape=[1], dtype="int64", lod_level=2)
init_scores = fluid.layers.data(
name="init_scores", shape=[1], dtype="float32", lod_level=2)
ids_array = fluid.layers.array_write(init_ids, i=counter)
scores_array = fluid.layers.array_write(init_scores, i=counter)
state_array = fluid.layers.array_write(encoder_last_proj, i=counter)
# 定义循环终止条件变量
cond = fluid.layers.less_than(x=counter, y=max_len)
while_op = fluid.layers.While(cond=cond)
with while_op.block():
# 获取解码器在当前步的输入,包括上一步选择的id,对应的score和上一步的state
pre_ids = fluid.layers.array_read(array=ids_array, i=counter)
pre_score = fluid.layers.array_read(array=scores_array, i=counter)
pre_state = fluid.layers.array_read(array=state_array, i=counter)
# 同train_decoder中的内容,进行RNN的单步计算
pre_ids_emb = fluid.layers.embedding(
input=pre_ids,
size=[target_dict_size, word_dim],
dtype='float32',
is_sparse=is_sparse)
out, current_state = cell(pre_ids_emb, pre_state, encoder_out,
encoder_out_proj)
prob = fluid.layers.fc(
input=current_state, size=target_dict_size, act='softmax')
# 计算累计得分,进行beam search
topk_scores, topk_indices = fluid.layers.topk(prob, k=beam_size)
accu_scores = fluid.layers.elementwise_add(
x=fluid.layers.log(topk_scores),
y=fluid.layers.reshape(pre_score, shape=[-1]),
axis=0)
accu_scores = fluid.layers.lod_reset(x=accu_scores, y=pre_ids)
selected_ids, selected_scores = fluid.layers.beam_search(
pre_ids, pre_score, topk_indices, accu_scores, beam_size, end_id=1)
fluid.layers.increment(x=counter, value=1, in_place=True)
# 将 search 结果写入 tensor array 中
fluid.layers.array_write(selected_ids, array=ids_array, i=counter)
fluid.layers.array_write(selected_scores, array=scores_array, i=counter)
# sequence_expand 作为 gather 使用以获取search结果对应的状态,并更新
current_state = fluid.layers.sequence_expand(current_state,
selected_ids)
fluid.layers.array_write(current_state, array=state_array, i=counter)
current_enc_out = fluid.layers.sequence_expand(encoder_out,
selected_ids)
fluid.layers.assign(current_enc_out, encoder_out)
current_enc_out_proj = fluid.layers.sequence_expand(
encoder_out_proj, selected_ids)
fluid.layers.assign(current_enc_out_proj, encoder_out_proj)
# 更新循环终止条件
length_cond = fluid.layers.less_than(x=counter, y=max_len)
finish_cond = fluid.layers.logical_not(
fluid.layers.is_empty(x=selected_ids))
fluid.layers.logical_and(x=length_cond, y=finish_cond, out=cond)
# 根据保存的每一步的结果,回溯生成最终解码结果
translation_ids, translation_scores = fluid.layers.beam_search_decode(
ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=1)
return translation_ids, translation_scores
```
### 定义数据生成器
使用编码器和预测模式的解码器,预测网络定义如下:
使用内置的`paddle.dataset.wmt16.train`接口定义数据生成器,其每次产生一条样本;经过 shuffle 和组 batch 后,再对 batch 内的样本进行 padding,作为训练的输入。同时定义预测使用的数据生成器,如下:
```python
def infer_model():
encoder_out = encoder()
translation_ids, translation_scores = infer_decoder(encoder_out)
return translation_ids, translation_scores
def inputs_generator(batch_size, pad_id, is_train=True):
data_generator = fluid.io.shuffle(
paddle.dataset.wmt16.train(source_dict_size, target_dict_size),
buf_size=10000) if is_train else paddle.dataset.wmt16.test(
source_dict_size, target_dict_size)
batch_generator = fluid.io.batch(data_generator, batch_size=batch_size)
# 对 batch 内的数据进行 padding
def _pad_batch_data(insts, pad_id):
seq_lengths = np.array(list(map(len, insts)), dtype="int64")
max_len = max(seq_lengths)
pad_data = np.array(
[inst + [pad_id] * (max_len - len(inst)) for inst in insts],
dtype="int64")
return pad_data, seq_lengths
def _generator():
for batch in batch_generator():
batch_src = [ins[0] for ins in batch]
src_data, src_lengths = _pad_batch_data(batch_src, pad_id)
inputs = [src_data, src_lengths]
if is_train: #训练时包含 target 和 label 数据
batch_trg = [ins[1] for ins in batch]
trg_data, trg_lengths = _pad_batch_data(batch_trg, pad_id)
batch_lbl = [ins[2] for ins in batch]
lbl_data, _ = _pad_batch_data(batch_lbl, pad_id)
inputs += [trg_data, trg_lengths, lbl_data]
yield inputs
return _generator
```
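其中 `_pad_batch_data` 的效果可以用下面这个独立的小例子直观感受(id 序列为假设数据,pad_id 取 eos_id 即 1):
```python
import numpy as np

def pad_batch_data(insts, pad_id):
    seq_lengths = np.array(list(map(len, insts)), dtype="int64")
    max_len = max(seq_lengths)
    pad_data = np.array(
        [inst + [pad_id] * (max_len - len(inst)) for inst in insts], dtype="int64")
    return pad_data, seq_lengths

pad_data, seq_lengths = pad_batch_data([[0, 5, 8, 2, 1], [0, 7, 3, 1]], pad_id=1)
print(pad_data)      # [[0 5 8 2 1] [0 7 3 1 1]]:短句末尾补 pad_id
print(seq_lengths)   # [5 4]:真实长度供 sequence_mask 使用
```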
## 训练模型
### 构建训练程序
定义用于训练的`Program`,在其中创建训练的网络结构并添加优化器。同时还要定义用于初始化的`Program`,在创建训练网络的同时隐式地加入参数初始化的操作。
......@@ -487,40 +537,33 @@ train_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(train_prog, startup_prog):
with fluid.unique_name.guard():
avg_cost = train_model()
# 训练时:
# inputs = [src, src_sequence_length, trg, trg_sequence_length, label]
inputs, loader = data_func(is_train=True)
logits = model_func(inputs, is_train=True)
loss = loss_func(logits, inputs[-1], inputs[-2])
optimizer = optimizer_func()
optimizer.minimize(avg_cost)
optimizer.minimize(loss)
```
### 定义训练环境与执行器
### 定义训练环境
定义您的训练环境,可以指定训练是发生在CPU还是GPU上;并基于这个训练环境定义执行器。
定义您的训练环境,包括指定所用的设备、绑定训练使用的数据源和定义执行器。
```python
# 设置训练设备
use_cuda = False
# 定义使用设备和执行器
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
```
### 构建数据提供器
使用封装的`paddle.dataset.wmt16.train`接口定义数据生成器,其每次产生一条样本,shuffle和组完batch后作为训练的输入;另外还需要指明输入数据中各字段和`data_layer`定义的各输入的对应关系,这可以通过`DataFeeder`完成, 下面的feeder将产生数据的第一列映射到`src_word_id`这个输入。
```python
# 定义训练数据生成器
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt16.train(source_dict_size, target_dict_size),
buf_size=10000),
batch_size=batch_size)
# DataFeeder完成
feeder = fluid.DataFeeder(
feed_list=[
'src_word_id', 'target_language_word', 'target_language_next_word'
],
place=place,
program=train_prog)
places = fluid.cuda_places() if use_cuda else fluid.cpu_places()
# 设置数据源
loader.set_batch_generator(inputs_generator(batch_size,
eos_id,
is_train=True),
places=places)
# 定义执行器,初始化参数并绑定Program
exe = fluid.Executor(places[0])
exe.run(startup_prog)
prog = fluid.CompiledProgram(train_prog).with_data_parallel(
loss_name=loss.name)
```
### 训练主循环
......@@ -528,17 +571,13 @@ feeder = fluid.DataFeeder(
按训练轮数(EPOCH_NUM)进行训练循环,并且每轮结束后都保存训练好的参数。注意,开始循环训练前要先执行初始化的`Program`来初始化参数。另外,作为示例这里 EPOCH_NUM 设置较小,在该数据集上实际约需 20 个 epoch 才能收敛。
```python
# 执行初始化 Program,进行参数初始化
exe.run(startup_prog)
# 循环迭代执行训练
EPOCH_NUM = 2
for pass_id in six.moves.xrange(EPOCH_NUM):
batch_id = 0
for data in train_data():
cost = exe.run(
train_prog, feed=feeder.feed(data), fetch_list=[avg_cost])[0]
print('pass_id: %d, batch_id: %d, loss: %f' % (pass_id, batch_id,
cost))
for data in loader():
loss_val = exe.run(prog, feed=data, fetch_list=[loss])[0]
print('pass_id: %d, batch_id: %d, loss: %f' %
(pass_id, batch_id, loss_val))
batch_id += 1
# 保存模型
fluid.io.save_params(exe, model_save_dir, main_program=train_prog)
......@@ -555,81 +594,67 @@ infer_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(infer_prog, startup_prog):
with fluid.unique_name.guard():
translation_ids, translation_scores = infer_model()
inputs, loader = data_func(is_train=False)
predict_seqs = model_func(inputs, is_train=False)
```
### 构建数据提供器
### 定义预测环境
和训练类似,这里使用封装的`paddle.dataset.wmt16.test`接口定义测试数据生成器,测试数据共1000条,组完batch后作为预测的输入;另外我们获取源语言和目标语言id到word的词典,以将id序列转换为明文序列打印输出
定义您的预测环境,和训练类似,包括指定所用的设备、绑定预测使用的数据源和定义执行器。
```python
test_data = paddle.batch(
paddle.dataset.wmt16.test(source_dict_size, target_dict_size),
batch_size=batch_size)
src_idx2word = paddle.dataset.wmt16.get_dict(
"en", source_dict_size, reverse=True)
trg_idx2word = paddle.dataset.wmt16.get_dict(
"de", target_dict_size, reverse=True)
use_cuda = False
# 设置预测设备
places = fluid.cuda_places() if use_cuda else fluid.cpu_places()
# 设置数据源
loader.set_batch_generator(inputs_generator(batch_size,
eos_id,
is_train=False),
places=places)
# 定义执行器,加载参数并绑定Program
exe = fluid.Executor(places[0])
exe.run(startup_prog)
fluid.io.load_params(exe, model_save_dir, main_program=infer_prog)
prog = fluid.CompiledProgram(infer_prog).with_data_parallel()
```
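在执行上面的 `fluid.io.load_params` 之前,也可以先加一个简单的检查,确认训练阶段保存的参数目录确实存在,便于定位问题(仅作示意):
```python
import os
assert os.path.isdir(model_save_dir), "未找到 %s,请先运行训练流程保存参数" % model_save_dir
```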
### 测试
首先要加载训练过程保存下来的模型,然后就可以循环测试数据进行预测了。这里每次运行我们都会创建`data_layer`对应输入数据的`dict`传入,这个和`DataFeeder`相同的效果。生成过程对于每个测试数据都会将源语言句子和`beam_size`个生成句子打印输出。
循环测试数据进行预测,生成过程中会为每条测试数据打印源语言句子和`beam_size`个生成句子;为了以明文形式输出,还需要使用 id 到 word 映射的词典。如下:
```python
fluid.io.load_params(exe, model_save_dir, main_program=infer_prog)
for data in test_data():
src_word_id = fluid.create_lod_tensor(
data=[x[0] for x in data],
recursive_seq_lens=[[len(x[0]) for x in data]],
place=place)
# init_ids内容为start token
init_ids = fluid.create_lod_tensor(
data=np.array([[0]] * len(data), dtype='int64'),
recursive_seq_lens=[[1] * len(data)] * 2,
place=place)
# init_scores为beam search过程累积得分的初值
init_scores = fluid.create_lod_tensor(
data=np.array([[0.]] * len(data), dtype='float32'),
recursive_seq_lens=[[1] * len(data)] * 2,
place=place)
seq_ids, seq_scores = exe.run(
infer_prog,
feed={
'src_word_id': src_word_id,
'init_ids': init_ids,
'init_scores': init_scores
},
fetch_list=[translation_ids, translation_scores],
return_numpy=False)
# 如何解析翻译结果详见 train.py 中对应代码的注释说明
hyps = [[] for i in range(len(seq_ids.lod()[0]) - 1)]
scores = [[] for i in range(len(seq_scores.lod()[0]) - 1)]
for i in range(len(seq_ids.lod()[0]) - 1):
start = seq_ids.lod()[0][i]
end = seq_ids.lod()[0][i + 1]
# 获取 id 到 word 映射的词典
src_idx2word = paddle.dataset.wmt16.get_dict(
"en", source_dict_size, reverse=True)
trg_idx2word = paddle.dataset.wmt16.get_dict(
"de", target_dict_size, reverse=True)
# 循环测试数据
for data in loader():
seq_ids = exe.run(prog, feed=data, fetch_list=[predict_seqs])[0]
for ins_idx in range(seq_ids.shape[0]):
print("Original sentence:")
print(" ".join([src_idx2word[idx] for idx in data[i][0][1:-1]]))
print("Translated score and sentence:")
for j in range(end - start):
sub_start = seq_ids.lod()[1][start + j]
sub_end = seq_ids.lod()[1][start + j + 1]
hyps[i].append(" ".join([
trg_idx2word[idx]
for idx in np.array(seq_ids)[sub_start:sub_end][1:-1]
src_seqs = np.array(data[0]["src"])
print(" ".join([
src_idx2word[idx] for idx in src_seqs[ins_idx][1:]
if idx != eos_id
]))
scores[i].append(np.array(seq_scores)[sub_end - 1])
print(scores[i][-1], hyps[i][-1].encode('utf8'))
print("Translated sentence:")
for beam_idx in range(beam_size):
seq = [
trg_idx2word[idx] for idx in seq_ids[ins_idx, :, beam_idx]
if idx != eos_id
]
print(" ".join(seq).encode("utf8"))
```
可以观察到如下的预测结果输出:
```txt
Original sentence:
Two adults and two children sit on a park bench .
A man in an orange hat starring at something .
Translated score and sentence:
-2.5993705 Zwei Erwachsene und zwei Kinder sitzen auf einer Parkbank .
-2.6617606 Zwei Erwachsene und zwei Kinder spielen auf einer Parkbank .
-3.186554 Zwei Erwachsene und zwei Kinder sitzen auf einer Bank .
-3.4353821 Zwei Erwachsene und zwei Kinder spielen auf einer Bank .
Ein Mann mit einem orangen Schutzhelm starrt auf etwas .
Ein Mann mit einem gelben Schutzhelm starrt auf etwas .
Ein Mann mit einem gelben Schutzhelm starrt etwas an .
Ein Mann mit einem orangen Schutzhelm starrt etwas an .
```
## 总结
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import six
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
dict_size = 30000
source_dict_size = target_dict_size = dict_size
bos_id = 0
eos_id = 1
word_dim = 512
hidden_dim = 512
decoder_size = hidden_dim
max_length = 256
beam_size = 4
batch_size = 64
model_save_dir = "machine_translation.inference.model"
class DecoderCell(layers.RNNCell):
"""Additive Attention followed by GRU"""
def __init__(self, hidden_size):
self.hidden_size = hidden_size
self.gru_cell = layers.GRUCell(hidden_size)
def attention(self, hidden, encoder_output, encoder_output_proj,
encoder_padding_mask):
decoder_state_proj = layers.unsqueeze(
layers.fc(hidden, size=self.hidden_size, bias_attr=False), [1])
mixed_state = fluid.layers.elementwise_add(
encoder_output_proj,
layers.expand(decoder_state_proj,
[1, layers.shape(decoder_state_proj)[1], 1]))
# attn_scores: [batch_size, src_seq_len]
attn_scores = layers.squeeze(
layers.fc(
input=mixed_state, size=1, num_flatten_dims=2, bias_attr=False),
[2])
if encoder_padding_mask is not None:
attn_scores = layers.elementwise_add(attn_scores,
encoder_padding_mask)
attn_scores = layers.softmax(attn_scores)
context = layers.reduce_sum(
layers.elementwise_mul(encoder_output, attn_scores, axis=0), dim=1)
return context
def call(self,
step_input,
hidden,
encoder_output,
encoder_output_proj,
encoder_padding_mask=None):
context = self.attention(hidden, encoder_output, encoder_output_proj,
encoder_padding_mask)
step_input = layers.concat([step_input, context], axis=1)
output, new_hidden = self.gru_cell(step_input, hidden)
return output, new_hidden
def data_func(is_train=True):
"""data inputs and data loader"""
src = fluid.data(name="src", shape=[None, None], dtype="int64")
src_sequence_length = fluid.data(
name="src_sequence_length", shape=[None], dtype="int64")
inputs = [src, src_sequence_length]
if is_train:
trg = fluid.data(name="trg", shape=[None, None], dtype="int64")
trg_sequence_length = fluid.data(
name="trg_sequence_length", shape=[None], dtype="int64")
label = fluid.data(name="label", shape=[None, None], dtype="int64")
inputs += [trg, trg_sequence_length, label]
loader = fluid.io.DataLoader.from_generator(
feed_list=inputs, capacity=10, iterable=True, use_double_buffer=True)
return inputs, loader
def encoder(src_embedding, src_sequence_length):
"""Encoder: Bidirectional GRU"""
encoder_fwd_cell = layers.GRUCell(hidden_size=hidden_dim)
encoder_fwd_output, fwd_state = layers.rnn(
cell=encoder_fwd_cell,
inputs=src_embedding,
sequence_length=src_sequence_length,
time_major=False,
is_reverse=False)
encoder_bwd_cell = layers.GRUCell(hidden_size=hidden_dim)
encoder_bwd_output, bwd_state = layers.rnn(
cell=encoder_bwd_cell,
inputs=src_embedding,
sequence_length=src_sequence_length,
time_major=False,
is_reverse=True)
encoder_output = layers.concat(
input=[encoder_fwd_output, encoder_bwd_output], axis=2)
encoder_state = layers.concat(input=[fwd_state, bwd_state], axis=1)
return encoder_output, encoder_state
def decoder(encoder_output,
encoder_output_proj,
encoder_state,
encoder_padding_mask,
trg=None,
is_train=True):
"""Decoder: GRU with Attention"""
decoder_cell = DecoderCell(hidden_size=decoder_size)
decoder_initial_states = layers.fc(
encoder_state, size=decoder_size, act="tanh")
trg_embeder = lambda x: fluid.embedding(input=x,
size=[target_dict_size, hidden_dim],
dtype="float32",
param_attr=fluid.ParamAttr(
name="trg_emb_table"))
output_layer = lambda x: layers.fc(x,
size=target_dict_size,
num_flatten_dims=len(x.shape) - 1,
param_attr=fluid.ParamAttr(name=
"output_w"))
if is_train:
decoder_output, _ = layers.rnn(
cell=decoder_cell,
inputs=trg_embeder(trg),
initial_states=decoder_initial_states,
time_major=False,
encoder_output=encoder_output,
encoder_output_proj=encoder_output_proj,
encoder_padding_mask=encoder_padding_mask)
decoder_output = output_layer(decoder_output)
else:
encoder_output = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_output, beam_size)
encoder_output_proj = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_output_proj, beam_size)
encoder_padding_mask = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_padding_mask, beam_size)
beam_search_decoder = layers.BeamSearchDecoder(
cell=decoder_cell,
start_token=bos_id,
end_token=eos_id,
beam_size=beam_size,
embedding_fn=trg_embeder,
output_fn=output_layer)
decoder_output, _ = layers.dynamic_decode(
decoder=beam_search_decoder,
inits=decoder_initial_states,
max_step_num=max_length,
output_time_major=False,
encoder_output=encoder_output,
encoder_output_proj=encoder_output_proj,
encoder_padding_mask=encoder_padding_mask)
return decoder_output
def model_func(inputs, is_train=True):
src = inputs[0]
src_sequence_length = inputs[1]
# source embedding
src_embeder = lambda x: fluid.embedding(
input=x,
size=[source_dict_size, hidden_dim],
dtype="float32",
param_attr=fluid.ParamAttr(name="src_emb_table"))
src_embedding = src_embeder(src)
# encoder
encoder_output, encoder_state = encoder(src_embedding, src_sequence_length)
encoder_output_proj = layers.fc(
input=encoder_output,
size=decoder_size,
num_flatten_dims=2,
bias_attr=False)
src_mask = layers.sequence_mask(
src_sequence_length, maxlen=layers.shape(src)[1], dtype="float32")
encoder_padding_mask = (src_mask - 1.0) * 1e9
trg = inputs[2] if is_train else None
# decoder
output = decoder(
encoder_output=encoder_output,
encoder_output_proj=encoder_output_proj,
encoder_state=encoder_state,
encoder_padding_mask=encoder_padding_mask,
trg=trg,
is_train=is_train)
return output
def loss_func(logits, label, trg_sequence_length):
probs = layers.softmax(logits)
loss = layers.cross_entropy(input=probs, label=label)
trg_mask = layers.sequence_mask(
trg_sequence_length, maxlen=layers.shape(logits)[1], dtype="float32")
avg_cost = layers.reduce_sum(loss * trg_mask) / layers.reduce_sum(trg_mask)
return avg_cost
def optimizer_func():
fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
clip_norm=5.0))
lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(hidden_dim, 1000)
return fluid.optimizer.Adam(
learning_rate=lr_decay,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4))
def inputs_generator(batch_size, pad_id, is_train=True):
data_generator = fluid.io.shuffle(
paddle.dataset.wmt16.train(source_dict_size, target_dict_size),
buf_size=10000) if is_train else paddle.dataset.wmt16.test(
source_dict_size, target_dict_size)
batch_generator = fluid.io.batch(data_generator, batch_size=batch_size)
def _pad_batch_data(insts, pad_id):
seq_lengths = np.array(list(map(len, insts)), dtype="int64")
max_len = max(seq_lengths)
pad_data = np.array(
[inst + [pad_id] * (max_len - len(inst)) for inst in insts],
dtype="int64")
return pad_data, seq_lengths
def _generator():
for batch in batch_generator():
batch_src = [ins[0] for ins in batch]
src_data, src_lengths = _pad_batch_data(batch_src, pad_id)
inputs = [src_data, src_lengths]
if is_train:
batch_trg = [ins[1] for ins in batch]
trg_data, trg_lengths = _pad_batch_data(batch_trg, pad_id)
batch_lbl = [ins[2] for ins in batch]
lbl_data, _ = _pad_batch_data(batch_lbl, pad_id)
inputs += [trg_data, trg_lengths, lbl_data]
yield inputs
return _generator
def train(use_cuda):
# define program
train_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(train_prog, startup_prog):
with fluid.unique_name.guard():
# For training:
# inputs = [src, src_sequence_length, trg, trg_sequence_length, label]
inputs, loader = data_func(is_train=True)
logits = model_func(inputs, is_train=True)
loss = loss_func(logits, inputs[-1], inputs[-2])
optimizer = optimizer_func()
optimizer.minimize(loss)
# define data source
places = fluid.cuda_places() if use_cuda else fluid.cpu_places()
loader.set_batch_generator(
inputs_generator(batch_size, eos_id, is_train=True), places=places)
exe = fluid.Executor(places[0])
exe.run(startup_prog)
prog = fluid.CompiledProgram(train_prog).with_data_parallel(
loss_name=loss.name)
EPOCH_NUM = 20
for pass_id in six.moves.xrange(EPOCH_NUM):
batch_id = 0
for data in loader():
loss_val = exe.run(prog, feed=data, fetch_list=[loss])[0]
print('pass_id: %d, batch_id: %d, loss: %f' %
(pass_id, batch_id, loss_val))
batch_id += 1
fluid.io.save_params(exe, model_save_dir, main_program=train_prog)
def infer(use_cuda):
# define program
infer_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(infer_prog, startup_prog):
with fluid.unique_name.guard():
inputs, loader = data_func(is_train=False)
predict_seqs = model_func(inputs, is_train=False)
# define data source
places = fluid.cuda_places() if use_cuda else fluid.cpu_places()
loader.set_batch_generator(
inputs_generator(batch_size, eos_id, is_train=False), places=places)
src_idx2word = paddle.dataset.wmt16.get_dict(
"en", source_dict_size, reverse=True)
trg_idx2word = paddle.dataset.wmt16.get_dict(
"de", target_dict_size, reverse=True)
exe = fluid.Executor(places[0])
exe.run(startup_prog)
fluid.io.load_params(exe, model_save_dir, main_program=infer_prog)
prog = fluid.CompiledProgram(infer_prog).with_data_parallel()
for data in loader():
seq_ids = exe.run(prog, feed=data, fetch_list=[predict_seqs])[0]
for ins_idx in range(seq_ids.shape[0]):
print("Original sentence:")
src_seqs = np.array(data[0]["src"])
print(" ".join([
src_idx2word[idx] for idx in src_seqs[ins_idx][1:]
if idx != eos_id
]))
print("Translated sentence:")
for beam_idx in range(beam_size):
seq = [
trg_idx2word[idx] for idx in seq_ids[ins_idx, :, beam_idx]
if idx != eos_id
]
print(" ".join(seq).encode("utf8"))
def main(use_cuda):
train(use_cuda)
infer(use_cuda)
if __name__ == '__main__':
use_cuda = False # set to True if training with GPU
main(use_cuda)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import six
import numpy as np
import paddle
import paddle.fluid as fluid
dict_size = 30000
source_dict_size = target_dict_size = dict_size
word_dim = 512
hidden_dim = 512
decoder_size = hidden_dim
max_length = 256
beam_size = 4
batch_size = 64
is_sparse = True
model_save_dir = "machine_translation.inference.model"
def encoder():
src_word_id = fluid.layers.data(
name="src_word_id", shape=[1], dtype='int64', lod_level=1)
src_embedding = fluid.layers.embedding(
input=src_word_id,
size=[source_dict_size, word_dim],
dtype='float32',
is_sparse=is_sparse)
fc_forward = fluid.layers.fc(
input=src_embedding, size=hidden_dim * 3, bias_attr=False)
src_forward = fluid.layers.dynamic_gru(input=fc_forward, size=hidden_dim)
fc_backward = fluid.layers.fc(
input=src_embedding, size=hidden_dim * 3, bias_attr=False)
src_backward = fluid.layers.dynamic_gru(
input=fc_backward, size=hidden_dim, is_reverse=True)
encoded_vector = fluid.layers.concat(
input=[src_forward, src_backward], axis=1)
return encoded_vector
def cell(x, hidden, encoder_out, encoder_out_proj):
def simple_attention(encoder_vec, encoder_proj, decoder_state):
decoder_state_proj = fluid.layers.fc(
input=decoder_state, size=decoder_size, bias_attr=False)
decoder_state_expand = fluid.layers.sequence_expand(
x=decoder_state_proj, y=encoder_proj)
mixed_state = fluid.layers.elementwise_add(encoder_proj,
decoder_state_expand)
attention_weights = fluid.layers.fc(
input=mixed_state, size=1, bias_attr=False)
attention_weights = fluid.layers.sequence_softmax(
input=attention_weights)
weigths_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
scaled = fluid.layers.elementwise_mul(
x=encoder_vec, y=weigths_reshape, axis=0)
context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
return context
context = simple_attention(encoder_out, encoder_out_proj, hidden)
out = fluid.layers.fc(
input=[x, context], size=decoder_size * 3, bias_attr=False)
out = fluid.layers.gru_unit(
input=out, hidden=hidden, size=decoder_size * 3)[0]
return out, out
def train_decoder(encoder_out):
encoder_last = fluid.layers.sequence_last_step(input=encoder_out)
encoder_last_proj = fluid.layers.fc(
input=encoder_last, size=decoder_size, act='tanh')
# cache the encoder_out's computed result in attention
encoder_out_proj = fluid.layers.fc(
input=encoder_out, size=decoder_size, bias_attr=False)
trg_language_word = fluid.layers.data(
name="target_language_word", shape=[1], dtype='int64', lod_level=1)
trg_embedding = fluid.layers.embedding(
input=trg_language_word,
size=[target_dict_size, word_dim],
dtype='float32',
is_sparse=is_sparse)
rnn = fluid.layers.DynamicRNN()
with rnn.block():
x = rnn.step_input(trg_embedding)
pre_state = rnn.memory(init=encoder_last_proj, need_reorder=True)
encoder_out = rnn.static_input(encoder_out)
encoder_out_proj = rnn.static_input(encoder_out_proj)
out, current_state = cell(x, pre_state, encoder_out, encoder_out_proj)
prob = fluid.layers.fc(input=out, size=target_dict_size, act='softmax')
rnn.update_memory(pre_state, current_state)
rnn.output(prob)
return rnn()
def train_model():
encoder_out = encoder()
rnn_out = train_decoder(encoder_out)
label = fluid.layers.data(
name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
cost = fluid.layers.cross_entropy(input=rnn_out, label=label)
avg_cost = fluid.layers.mean(cost)
return avg_cost
def optimizer_func():
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))
lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(hidden_dim, 1000)
return fluid.optimizer.Adam(
learning_rate=lr_decay,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4))
def train(use_cuda):
train_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(train_prog, startup_prog):
with fluid.unique_name.guard():
avg_cost = train_model()
optimizer = optimizer_func()
optimizer.minimize(avg_cost)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt16.train(source_dict_size, target_dict_size),
buf_size=10000),
batch_size=batch_size)
feeder = fluid.DataFeeder(
feed_list=[
'src_word_id', 'target_language_word', 'target_language_next_word'
],
place=place,
program=train_prog)
exe.run(startup_prog)
EPOCH_NUM = 20
for pass_id in six.moves.xrange(EPOCH_NUM):
batch_id = 0
for data in train_data():
cost = exe.run(
train_prog, feed=feeder.feed(data), fetch_list=[avg_cost])[0]
print('pass_id: %d, batch_id: %d, loss: %f' % (pass_id, batch_id,
cost))
batch_id += 1
fluid.io.save_params(exe, model_save_dir, main_program=train_prog)
def infer_decoder(encoder_out):
encoder_last = fluid.layers.sequence_last_step(input=encoder_out)
encoder_last_proj = fluid.layers.fc(
input=encoder_last, size=decoder_size, act='tanh')
encoder_out_proj = fluid.layers.fc(
input=encoder_out, size=decoder_size, bias_attr=False)
max_len = fluid.layers.fill_constant(
shape=[1], dtype='int64', value=max_length)
counter = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True)
init_ids = fluid.layers.data(
name="init_ids", shape=[1], dtype="int64", lod_level=2)
init_scores = fluid.layers.data(
name="init_scores", shape=[1], dtype="float32", lod_level=2)
# create and init arrays to save selected ids, scores and states for each step
ids_array = fluid.layers.array_write(init_ids, i=counter)
scores_array = fluid.layers.array_write(init_scores, i=counter)
state_array = fluid.layers.array_write(encoder_last_proj, i=counter)
cond = fluid.layers.less_than(x=counter, y=max_len)
while_op = fluid.layers.While(cond=cond)
with while_op.block():
pre_ids = fluid.layers.array_read(array=ids_array, i=counter)
pre_score = fluid.layers.array_read(array=scores_array, i=counter)
pre_state = fluid.layers.array_read(array=state_array, i=counter)
pre_ids_emb = fluid.layers.embedding(
input=pre_ids,
size=[target_dict_size, word_dim],
dtype='float32',
is_sparse=is_sparse)
out, current_state = cell(pre_ids_emb, pre_state, encoder_out,
encoder_out_proj)
prob = fluid.layers.fc(
input=current_state, size=target_dict_size, act='softmax')
# beam search
topk_scores, topk_indices = fluid.layers.topk(prob, k=beam_size)
accu_scores = fluid.layers.elementwise_add(
x=fluid.layers.log(topk_scores),
y=fluid.layers.reshape(pre_score, shape=[-1]),
axis=0)
accu_scores = fluid.layers.lod_reset(x=accu_scores, y=pre_ids)
selected_ids, selected_scores = fluid.layers.beam_search(
pre_ids, pre_score, topk_indices, accu_scores, beam_size, end_id=1)
fluid.layers.increment(x=counter, value=1, in_place=True)
# save selected ids and corresponding scores of each step
fluid.layers.array_write(selected_ids, array=ids_array, i=counter)
fluid.layers.array_write(selected_scores, array=scores_array, i=counter)
# update rnn state by sequence_expand acting as gather
current_state = fluid.layers.sequence_expand(current_state,
selected_ids)
fluid.layers.array_write(current_state, array=state_array, i=counter)
current_enc_out = fluid.layers.sequence_expand(encoder_out,
selected_ids)
fluid.layers.assign(current_enc_out, encoder_out)
current_enc_out_proj = fluid.layers.sequence_expand(encoder_out_proj,
selected_ids)
fluid.layers.assign(current_enc_out_proj, encoder_out_proj)
# update conditional variable
length_cond = fluid.layers.less_than(x=counter, y=max_len)
finish_cond = fluid.layers.logical_not(
fluid.layers.is_empty(x=selected_ids))
fluid.layers.logical_and(x=length_cond, y=finish_cond, out=cond)
translation_ids, translation_scores = fluid.layers.beam_search_decode(
ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=1)
return translation_ids, translation_scores
def infer_model():
encoder_out = encoder()
translation_ids, translation_scores = infer_decoder(encoder_out)
return translation_ids, translation_scores
def infer(use_cuda):
infer_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(infer_prog, startup_prog):
with fluid.unique_name.guard():
translation_ids, translation_scores = infer_model()
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
test_data = paddle.batch(
paddle.dataset.wmt16.test(source_dict_size, target_dict_size),
batch_size=batch_size)
src_idx2word = paddle.dataset.wmt16.get_dict(
"en", source_dict_size, reverse=True)
trg_idx2word = paddle.dataset.wmt16.get_dict(
"de", target_dict_size, reverse=True)
fluid.io.load_params(exe, model_save_dir, main_program=infer_prog)
for data in test_data():
src_word_id = fluid.create_lod_tensor(
data=[x[0] for x in data],
recursive_seq_lens=[[len(x[0]) for x in data]],
place=place)
init_ids = fluid.create_lod_tensor(
data=np.array([[0]] * len(data), dtype='int64'),
recursive_seq_lens=[[1] * len(data)] * 2,
place=place)
init_scores = fluid.create_lod_tensor(
data=np.array([[0.]] * len(data), dtype='float32'),
recursive_seq_lens=[[1] * len(data)] * 2,
place=place)
seq_ids, seq_scores = exe.run(
infer_prog,
feed={
'src_word_id': src_word_id,
'init_ids': init_ids,
'init_scores': init_scores
},
fetch_list=[translation_ids, translation_scores],
return_numpy=False)
# How to parse the results:
# Suppose the lod of seq_ids is:
# [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
# then from lod[0]:
# there are 2 source sentences, beam width is 3.
# from lod[1]:
# the first source sentence has 3 hyps; the lengths are 12, 12, 16
# the second source sentence has 3 hyps; the lengths are 14, 13, 15
hyps = [[] for i in range(len(seq_ids.lod()[0]) - 1)]
scores = [[] for i in range(len(seq_scores.lod()[0]) - 1)]
for i in range(len(seq_ids.lod()[0]) - 1): # for each source sentence
start = seq_ids.lod()[0][i]
end = seq_ids.lod()[0][i + 1]
print("Original sentence:")
print(" ".join([src_idx2word[idx] for idx in data[i][0][1:-1]]))
print("Translated score and sentence:")
for j in range(end - start): # for each candidate
sub_start = seq_ids.lod()[1][start + j]
sub_end = seq_ids.lod()[1][start + j + 1]
hyps[i].append(" ".join([
trg_idx2word[idx]
for idx in np.array(seq_ids)[sub_start:sub_end][1:-1]
]))
scores[i].append(np.array(seq_scores)[sub_end - 1])
print(scores[i][-1], hyps[i][-1].encode('utf8'))
def main(use_cuda):
train(use_cuda)
infer(use_cuda)
if __name__ == '__main__':
use_cuda = False # set to True if training with GPU
main(use_cuda)
......@@ -265,16 +265,16 @@ dg_program = fluid.Program()
# 定义判别真实图片的program
with fluid.program_guard(d_program):
# 输入图片大小为28*28=784
img = fluid.layers.data(name='img', shape=[784], dtype='float32')
img = fluid.data(name='img', shape=[None, 784], dtype='float32')
# 标签shape=1
label = fluid.layers.data(name='label', shape=[1], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='float32')
d_logit = D(img)
d_loss = loss(d_logit, label)
# 定义判别生成图片的program
with fluid.program_guard(dg_program):
noise = fluid.layers.data(
name='noise', shape=[NOISE_SIZE], dtype='float32')
noise = fluid.data(
name='noise', shape=[None, NOISE_SIZE], dtype='float32')
# 噪声数据作为输入得到生成图片
g_img = G(x=noise)
......
......@@ -60,14 +60,14 @@ def train(args):
dg_program = fluid.Program()
with fluid.program_guard(d_program):
img = fluid.layers.data(name='img', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='float32')
img = fluid.data(name='img', shape=[None, 784], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='float32')
d_logit = D(img)
d_loss = loss(d_logit, label)
with fluid.program_guard(dg_program):
noise = fluid.layers.data(
name='noise', shape=[NOISE_SIZE], dtype='float32')
noise = fluid.data(
name='noise', shape=[None, NOISE_SIZE], dtype='float32')
g_img = G(x=noise)
g_program = dg_program.clone()
......
......@@ -307,16 +307,16 @@ dg_program = fluid.Program()
# 定义判别真实图片的program
with fluid.program_guard(d_program):
# 输入图片大小为28*28=784
img = fluid.layers.data(name='img', shape=[784], dtype='float32')
img = fluid.data(name='img', shape=[None, 784], dtype='float32')
# 标签shape=1
label = fluid.layers.data(name='label', shape=[1], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='float32')
d_logit = D(img)
d_loss = loss(d_logit, label)
# 定义判别生成图片的program
with fluid.program_guard(dg_program):
noise = fluid.layers.data(
name='noise', shape=[NOISE_SIZE], dtype='float32')
noise = fluid.data(
name='noise', shape=[None, NOISE_SIZE], dtype='float32')
# 噪声数据作为输入得到生成图片
g_img = G(x=noise)
......
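上面各处将 `fluid.layers.data` 替换为 `fluid.data` 之后,输入的第 0 维显式地成为 batch 维(None)。下面是一个按该定义构造 feed 数据的示意(NOISE_SIZE、batch 大小均为假设值,真实图片应来自 MNIST 的 reader,标签约定以原文为准):
```python
import numpy as np

NOISE_SIZE = 100   # 假设的噪声维度,应与前文的定义保持一致
batch_size = 16

feed_d = {
    "img": np.random.rand(batch_size, 784).astype("float32"),  # 示意:实际应为真实图片数据
    "label": np.ones((batch_size, 1), dtype="float32"),         # 假设真实图片的标签为 1
}
feed_dg = {
    "noise": np.random.uniform(low=-1.0, high=1.0,
                               size=(batch_size, NOISE_SIZE)).astype("float32"),
}
# 之后可分别通过 exe.run(d_program, feed=feed_d, ...) 与 exe.run(dg_program, feed=feed_dg, ...) 执行
```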