Merge branch 'develop' of https://github.com/PaddlePaddle/book into readme

c59d32be · liaogang · 578bc2f3 · e89e682f · c59d32be · c59d32be
16 changed file
--- a/.tools/build_docker.sh
+++ b/.tools/build_docker.sh
@@ -48,7 +48,16 @@ FROM ${paddle_image}:${paddle_tag}
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>

 COPY . /book
+EOF
+
+if [ -n "${http_proxy}" ]; then  # quoted: unquoted, an empty value collapses to `[ -n ]`, which is always true
+cat >> Dockerfile <<EOF
+ENV http_proxy ${http_proxy}
+ENV https_proxy ${http_proxy}
+EOF
+fi

+cat >> Dockerfile <<EOF
 RUN pip install -U nltk \
    && python /book/.tools/cache_dataset.py

@@ -58,7 +67,7 @@ RUN ${update_mirror_cmd}
    apt-get -y install gcc && \
    apt-get -y clean && \
    localedef -f UTF-8 -i en_US en_US.UTF-8 && \
-    pip install -U matplotlib jupyter numpy requests scipy
+    pip install -U pillow matplotlib jupyter numpy requests scipy

 #convert md to ipynb
 RUN /book/.tools/notedown.sh

--- a/05.understand_sentiment/README.md
+++ b/05.understand_sentiment/README.md
@@ -141,7 +141,8 @@ import paddle.v2 as paddle
 def convolution_net(input_dim,
                    class_dim=2,
                    emb_dim=128,
-                    hid_dim=128):
+                    hid_dim=128,
+                    is_predict=False):
    data = paddle.layer.data("word",
                             paddle.data_type.integer_value_sequence(input_dim))
    emb = paddle.layer.embedding(input=data, size=emb_dim)
@@ -152,9 +153,12 @@ def convolution_net(input_dim,
    output = paddle.layer.fc(input=[conv_3, conv_4],
                             size=class_dim,
                             act=paddle.activation.Softmax())
-    lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
-    cost = paddle.layer.classification_cost(input=output, label=lbl)
-    return cost
+    if not is_predict:
+        lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
+        cost = paddle.layer.classification_cost(input=output, label=lbl)
+        return cost
+    else:
+        return output
 ```
 网络的输入`input_dim`表示的是词典的大小，`class_dim`表示类别数。这里，我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。

@@ -165,7 +169,8 @@ def stacked_lstm_net(input_dim,
                     class_dim=2,
                     emb_dim=128,
                     hid_dim=512,
-                     stacked_num=3):
+                     stacked_num=3,
+                     is_predict=False):
    """
    A Wrapper for sentiment classification task.
    This network uses bi-directional recurrent network,
@@ -223,9 +228,12 @@ def stacked_lstm_net(input_dim,
                             bias_attr=bias_attr,
                             param_attr=para_attr)

-    lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
-    cost = paddle.layer.classification_cost(input=output, label=lbl)
-    return cost
+    if not is_predict:
+        lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
+        cost = paddle.layer.classification_cost(input=output, label=lbl)
+        return cost
+    else:
+        return output
 ```
 网络的输入`stacked_num`表示的是LSTM的层数，需要是奇数，确保最高层LSTM正向。Paddle里面是通过一个fc和一个lstmemory来实现基于LSTM的循环神经网络。

@@ -294,7 +302,7 @@ Paddle中提供了一系列优化算法的API，这里使用Adam优化算法。

 ### 训练

-可以通过`paddle.trainer.SGD`构造一个sgd trainer，并调用`trainer.train`来训练模型。
+可以通过`paddle.trainer.SGD`构造一个sgd trainer，并调用`trainer.train`来训练模型。另外，通过给train函数传递一个`event_handler`来获取每个batch和每个pass结束的状态。
 ```python
    # End batch and end pass event handler
    def event_handler(event):
@@ -309,7 +317,21 @@ Paddle中提供了一系列优化算法的API，这里使用Adam优化算法。
            result = trainer.test(reader=test_reader, feeding=feeding)
            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
 ```
-可以通过给train函数传递一个`event_handler`来获取每个batch和每个pass结束的状态。比如构造如下一个`event_handler`可以在每100个batch结束后输出cost和error；在每个pass结束后调用`trainer.test`计算一遍测试集并获得当前模型在测试集上的error。
+比如，构造如下一个`event_handler`可以在每100个batch结束后输出cost和error；在每个pass结束后调用`trainer.test`计算一遍测试集并获得当前模型在测试集上的error。
+```python
+    from paddle.v2.plot import Ploter
+
+    train_title = "Train cost"
+    cost_ploter = Ploter(train_title)
+    step = 0
+    def event_handler_plot(event):
+        global step
+        if isinstance(event, paddle.event.EndIteration):
+            cost_ploter.append(train_title, step, event.cost)
+            cost_ploter.plot()
+            step += 1
+```
+或者构造一个`event_handler_plot`画出cost曲线。
 ```python
    # create trainer
    trainer = paddle.trainer.SGD(cost=cost,
@@ -331,6 +353,36 @@ Pass 0, Batch 100, Cost 0.294321, {'classification_error_evaluator': 0.1015625}
 Test with Pass 0, {'classification_error_evaluator': 0.11432000249624252}
 ```

+## 应用模型
+
+可以使用训练好的模型对电影评论进行分类，下面程序展示了如何使用`paddle.infer`接口进行推断。
+```python
+    import numpy as np
+
+    # Movie Reviews, from imdb test
+    reviews = [
+        'Read the book, forget the movie!',
+        'This is a great movie.'
+    ]
+    reviews = [c.split() for c in reviews]
+
+    UNK = word_dict['<unk>']
+    input = []
+    for c in reviews:
+        input.append([[word_dict.get(words, UNK) for words in c]])
+
+    # 0 stands for positive sample, 1 stands for negative sample
+    label = {0:'pos', 1:'neg'}
+    # Use the network used by trainer
+    out = convolution_net(dict_dim, class_dim=class_dim, is_predict=True)
+    # out = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3, is_predict=True)
+    probs = paddle.infer(output_layer=out, parameters=parameters, input=input)
+
+    labs = np.argsort(-probs)
+    for idx, lab in enumerate(labs):
+        print idx, "predicting probability is", probs[idx], "label is", label[lab[0]]
+```
+
 ## 总结

 本章我们以情感分析为例，介绍了使用深度学习的方法进行端对端的短文本分类，并且使用PaddlePaddle完成了全部相关实验。同时，我们简要介绍了两种文本处理模型：卷积神经网络和循环神经网络。在后续的章节中我们会看到这两种基本的深度学习模型在其它任务上的应用。

--- a/05.understand_sentiment/index.html
+++ b/05.understand_sentiment/index.html
@@ -183,7 +183,8 @@ import paddle.v2 as paddle
 def convolution_net(input_dim,
                    class_dim=2,
                    emb_dim=128,
-                    hid_dim=128):
+                    hid_dim=128,
+                    is_predict=False):
    data = paddle.layer.data("word",
                             paddle.data_type.integer_value_sequence(input_dim))
    emb = paddle.layer.embedding(input=data, size=emb_dim)
@@ -194,9 +195,12 @@ def convolution_net(input_dim,
    output = paddle.layer.fc(input=[conv_3, conv_4],
                             size=class_dim,
                             act=paddle.activation.Softmax())
-    lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
-    cost = paddle.layer.classification_cost(input=output, label=lbl)
-    return cost
+    if not is_predict:
+        lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
+        cost = paddle.layer.classification_cost(input=output, label=lbl)
+        return cost
+    else:
+        return output
 ```
 网络的输入`input_dim`表示的是词典的大小，`class_dim`表示类别数。这里，我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。

@@ -207,7 +211,8 @@ def stacked_lstm_net(input_dim,
                     class_dim=2,
                     emb_dim=128,
                     hid_dim=512,
-                     stacked_num=3):
+                     stacked_num=3,
+                     is_predict=False):
    """
    A Wrapper for sentiment classification task.
    This network uses bi-directional recurrent network,
@@ -265,9 +270,12 @@ def stacked_lstm_net(input_dim,
                             bias_attr=bias_attr,
                             param_attr=para_attr)

-    lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
-    cost = paddle.layer.classification_cost(input=output, label=lbl)
-    return cost
+    if not is_predict:
+        lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
+        cost = paddle.layer.classification_cost(input=output, label=lbl)
+        return cost
+    else:
+        return output
 ```
 网络的输入`stacked_num`表示的是LSTM的层数，需要是奇数，确保最高层LSTM正向。Paddle里面是通过一个fc和一个lstmemory来实现基于LSTM的循环神经网络。

@@ -336,7 +344,7 @@ Paddle中提供了一系列优化算法的API，这里使用Adam优化算法。

 ### 训练

-可以通过`paddle.trainer.SGD`构造一个sgd trainer，并调用`trainer.train`来训练模型。
+可以通过`paddle.trainer.SGD`构造一个sgd trainer，并调用`trainer.train`来训练模型。另外，通过给train函数传递一个`event_handler`来获取每个batch和每个pass结束的状态。
 ```python
    # End batch and end pass event handler
    def event_handler(event):
@@ -351,7 +359,21 @@ Paddle中提供了一系列优化算法的API，这里使用Adam优化算法。
            result = trainer.test(reader=test_reader, feeding=feeding)
            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
 ```
-可以通过给train函数传递一个`event_handler`来获取每个batch和每个pass结束的状态。比如构造如下一个`event_handler`可以在每100个batch结束后输出cost和error；在每个pass结束后调用`trainer.test`计算一遍测试集并获得当前模型在测试集上的error。
+比如，构造如下一个`event_handler`可以在每100个batch结束后输出cost和error；在每个pass结束后调用`trainer.test`计算一遍测试集并获得当前模型在测试集上的error。
+```python
+    from paddle.v2.plot import Ploter
+
+    train_title = "Train cost"
+    cost_ploter = Ploter(train_title)
+    step = 0
+    def event_handler_plot(event):
+        global step
+        if isinstance(event, paddle.event.EndIteration):
+            cost_ploter.append(train_title, step, event.cost)
+            cost_ploter.plot()
+            step += 1
+```
+或者构造一个`event_handler_plot`画出cost曲线。
 ```python
    # create trainer
    trainer = paddle.trainer.SGD(cost=cost,
@@ -373,6 +395,36 @@ Pass 0, Batch 100, Cost 0.294321, {'classification_error_evaluator': 0.1015625}
 Test with Pass 0, {'classification_error_evaluator': 0.11432000249624252}
 ```

+## 应用模型
+
+可以使用训练好的模型对电影评论进行分类，下面程序展示了如何使用`paddle.infer`接口进行推断。
+```python
+    import numpy as np
+
+    # Movie Reviews, from imdb test
+    reviews = [
+        'Read the book, forget the movie!',
+        'This is a great movie.'
+    ]
+    reviews = [c.split() for c in reviews]
+
+    UNK = word_dict['<unk>']
+    input = []
+    for c in reviews:
+        input.append([[word_dict.get(words, UNK) for words in c]])
+
+    # 0 stands for positive sample, 1 stands for negative sample
+    label = {0:'pos', 1:'neg'}
+    # Use the network used by trainer
+    out = convolution_net(dict_dim, class_dim=class_dim, is_predict=True)
+    # out = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3, is_predict=True)
+    probs = paddle.infer(output_layer=out, parameters=parameters, input=input)
+
+    labs = np.argsort(-probs)
+    for idx, lab in enumerate(labs):
+        print idx, "predicting probability is", probs[idx], "label is", label[lab[0]]
+```
+
 ## 总结

 本章我们以情感分析为例，介绍了使用深度学习的方法进行端对端的短文本分类，并且使用PaddlePaddle完成了全部相关实验。同时，我们简要介绍了两种文本处理模型：卷积神经网络和循环神经网络。在后续的章节中我们会看到这两种基本的深度学习模型在其它任务上的应用。

--- a/06.label_semantic_roles/README.en.md
+++ b/06.label_semantic_roles/README.en.md
@@ -214,6 +214,7 @@ import numpy as np
 import gzip
 import paddle.v2 as paddle
 import paddle.v2.dataset.conll05 as conll05
+import paddle.v2.evaluator as evaluator

 paddle.init(use_gpu=False, trainer_count=1)

@@ -343,23 +344,23 @@ for i in range(1, depth):
    input_tmp = [mix_hidden, lstm]
 ```

- We will concatenate the output of the top LSTM unit with its input, and project the result into a hidden layer. Then, we put a fully connected layer on top to get the final feature vector representation.
+- In PaddlePaddle, the state features and transition features of a CRF are implemented by a fully connected layer and a CRF layer, respectively. The fully connected layer with linear activation learns the state features; here we use paddle.layer.mixed (paddle.layer.fc can be used as well). The CRF layer in PaddlePaddle, paddle.layer.crf, only learns the transition features; it is a cost layer and is the last layer of the network. Given the input sequence, paddle.layer.crf outputs the log probability of the true tag sequence as the cost, and it requires the true tag sequence as the target during training.

- ```python
- feature_out = paddle.layer.mixed(
- size=label_dict_len,
- bias_attr=std_default,
- input=[
-     paddle.layer.full_matrix_projection(
-         input=input_tmp[0], param_attr=hidden_para_attr),
-     paddle.layer.full_matrix_projection(
-         input=input_tmp[1], param_attr=lstm_para_attr)
- ], )
- ```
+```python

- At the end of the network, we use CRF as the cost function; the parameter of CRF cost will be named `crfw`.
+# The output of the top LSTM unit and its input are fed into a fully connected layer,
+# whose size equals the size of the tag label dictionary.
+# The fully connected layer learns the state features.
+
+feature_out = paddle.layer.mixed(
+    size=label_dict_len,
+    bias_attr=std_default,
+    input=[
+        paddle.layer.full_matrix_projection(
+            input=input_tmp[0], param_attr=hidden_para_attr),
+        paddle.layer.full_matrix_projection(
+            input=input_tmp[1], param_attr=lstm_para_attr)], )

-```python
 crf_cost = paddle.layer.crf(
    size=label_dict_len,
    input=feature_out,
@@ -370,7 +371,7 @@ crf_cost = paddle.layer.crf(
        learning_rate=mix_hidden_lr))
 ```

- The CRF decoding layer is used for evaluation and inference. It shares weights with CRF layer.  The sharing of parameters among multiple layers is specified by using the same parameter name in these layers.
+- The CRF decoding layer is used for evaluation and inference. It shares weights with the CRF layer. The sharing of parameters among multiple layers is specified by using the same parameter name in these layers. If the true tag sequence is provided during training, `paddle.layer.crf_decoding` calculates the labelling error for each input token and `evaluator.sum` sums the error over the entire sequence. Otherwise, `paddle.layer.crf_decoding` generates the labelling tags.

 ```python
 crf_dec = paddle.layer.crf_decoding(
@@ -414,7 +415,7 @@ We will create trainer given model topology, parameters, and optimization method
 ```python
 optimizer = paddle.optimizer.Momentum(
    momentum=0,
-    learning_rate=2e-2,
+    learning_rate=1e-3,
    regularization=paddle.optimizer.L2Regularization(rate=8e-4),
    model_average=paddle.optimizer.ModelAverage(
        average_window=0.5, max_average_window=10000), )
@@ -432,7 +433,7 @@ As mentioned in data preparation section, we will use CoNLL 2005 test corpus as
 ```python
 reader = paddle.batch(
    paddle.reader.shuffle(
-        conll05.test(), buf_size=8192), batch_size=20)
+        conll05.test(), buf_size=8192), batch_size=2)
 ```

 `feeding` is used to specify the correspondence between data instance and data layer. For example, according to following `feeding`, the 0th column of data instance produced by`conll05.test()` is matched to the data layer named `word_data`.
@@ -456,17 +457,17 @@ feeding = {
 ```python
 def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
-        if event.batch_id % 100 == 0:
+        if event.batch_id and event.batch_id % 10 == 0:
            print "Pass %d, Batch %d, Cost %f, %s" % (
                event.pass_id, event.batch_id, event.cost, event.metrics)
-        if event.batch_id % 1000 == 0:
+        if event.batch_id % 400 == 0:
            result = trainer.test(reader=reader, feeding=feeding)
            print "\nTest with Pass %d, Batch %d, %s" % (event.pass_id, event.batch_id, result.metrics)

    if isinstance(event, paddle.event.EndPass):
        # save parameters
        with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
-            parameters.to_tar(f)  
+            parameters.to_tar(f)

        result = trainer.test(reader=reader, feeding=feeding)
        print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)

--- a/06.label_semantic_roles/README.md
+++ b/06.label_semantic_roles/README.md
@@ -192,6 +192,7 @@ import numpy as np
 import gzip
 import paddle.v2 as paddle
 import paddle.v2.dataset.conll05 as conll05
+import paddle.v2.evaluator as evaluator

 paddle.init(use_gpu=False, trainer_count=1)

@@ -274,12 +275,12 @@ emb_layers.append(mark_embedding)

 ```python  
 hidden_0 = paddle.layer.mixed(
-size=hidden_dim,
-bias_attr=std_default,
-input=[
-    paddle.layer.full_matrix_projection(
-        input=emb, param_attr=std_default) for emb in emb_layers
-])
+    size=hidden_dim,
+    bias_attr=std_default,
+    input=[
+        paddle.layer.full_matrix_projection(
+            input=emb, param_attr=std_default) for emb in emb_layers
+    ])

 mix_hidden_lr = 1e-3
 lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0)
@@ -320,23 +321,24 @@ for i in range(1, depth):
    input_tmp = [mix_hidden, lstm]
 ```

- 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射，经过一个全连接层映射到标记字典的维度，得到最终的特征向量表示。
+- 在PaddlePaddle中，CRF的状态特征和转移特征分别由一个全连接层和一个PaddlePaddle中的CRF层分别学习。在这个例子中，我们用线性激活的paddle.layer.mixed 来学习CRF的状态特征（也可以使用paddle.layer.fc），而 paddle.layer.crf只学习转移特征。paddle.layer.crf层是一个 cost 层，处于整个网络的末端，输出给定输入序列下，标记序列的log probability作为代价。训练阶段，该层需要输入正确的标记序列作为学习目标。

 ```python
-feature_out = paddle.layer.mixed(
-size=label_dict_len,
-bias_attr=std_default,
-input=[
-    paddle.layer.full_matrix_projection(
-        input=input_tmp[0], param_attr=hidden_para_attr),
-    paddle.layer.full_matrix_projection(
-        input=input_tmp[1], param_attr=lstm_para_attr)
-], )
-```

- 网络的末端定义CRF层计算损失(cost)，指定参数名字为 `crfw`，该层需要输入正确的数据标签(target)。
+# 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射，
+# 经过一个全连接层映射到标记字典的维度，来学习 CRF 的状态特征

-```python
+feature_out = paddle.layer.mixed(
+    size=label_dict_len,
+    bias_attr=std_default,
+    input=[
+        paddle.layer.full_matrix_projection(
+            input=input_tmp[0], param_attr=hidden_para_attr),
+        paddle.layer.full_matrix_projection(
+            input=input_tmp[1], param_attr=lstm_para_attr)
+    ], )
+
+# 学习 CRF 的转移特征
 crf_cost = paddle.layer.crf(
    size=label_dict_len,
    input=feature_out,
@@ -347,7 +349,7 @@ crf_cost = paddle.layer.crf(
        learning_rate=mix_hidden_lr))
 ```

- CRF译码层和CRF层参数名字相同，即共享权重。如果输入了正确的数据标签(target)，会统计错误标签的个数，可以用来评估模型。如果没有输入正确的数据标签，该层可以推到出最优解，可以用来预测模型。
+- CRF解码和CRF层参数名字相同，即：加载了`paddle.layer.crf`层学习到的参数。在训练阶段，为`paddle.layer.crf_decoding` 输入了正确的标记序列(target)，这一层会输出是否正确标记，`evaluator.sum` 用来计算序列上的标记错误率，可以用来评估模型。解码阶段，没有输入正确的数据标签，该层通过寻找概率最高的标记序列，解码出标记结果。

 ```python
 crf_dec = paddle.layer.crf_decoding(
@@ -394,7 +396,7 @@ parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))
 # create optimizer
 optimizer = paddle.optimizer.Momentum(
    momentum=0,
-    learning_rate=2e-2,
+    learning_rate=1e-3,
    regularization=paddle.optimizer.L2Regularization(rate=8e-4),
    model_average=paddle.optimizer.ModelAverage(
        average_window=0.5, max_average_window=10000), )
@@ -412,7 +414,7 @@ trainer = paddle.trainer.SGD(cost=crf_cost,
 ```python
 reader = paddle.batch(
    paddle.reader.shuffle(
-        conll05.test(), buf_size=8192), batch_size=20)
+        conll05.test(), buf_size=8192), batch_size=2)
 ```

 通过`feeding`来指定每一个数据和data_layer的对应关系。 例如 下面`feeding`表示: `conll05.test()`产生数据的第0列对应`word_data`层的特征。
@@ -437,17 +439,17 @@ feeding = {
 ```python
 def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
-        if event.batch_id % 100 == 0:
+        if event.batch_id and event.batch_id % 10 == 0:
            print "Pass %d, Batch %d, Cost %f, %s" % (
                event.pass_id, event.batch_id, event.cost, event.metrics)
-        if event.batch_id % 1000 == 0:
+        if event.batch_id % 400 == 0:
            result = trainer.test(reader=reader, feeding=feeding)
            print "\nTest with Pass %d, Batch %d, %s" % (event.pass_id, event.batch_id, result.metrics)

    if isinstance(event, paddle.event.EndPass):
        # save parameters
        with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
-            parameters.to_tar(f)  
+            parameters.to_tar(f)

        result = trainer.test(reader=reader, feeding=feeding)
        print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
@@ -459,7 +461,7 @@ def event_handler(event):
 trainer.train(
    reader=reader,
    event_handler=event_handler,
-    num_passes=10000,
+    num_passes=1,
    feeding=feeding)
 ```


--- a/06.label_semantic_roles/index.en.html
+++ b/06.label_semantic_roles/index.en.html
@@ -256,6 +256,7 @@ import numpy as np
 import gzip
 import paddle.v2 as paddle
 import paddle.v2.dataset.conll05 as conll05
+import paddle.v2.evaluator as evaluator

 paddle.init(use_gpu=False, trainer_count=1)

@@ -385,23 +386,23 @@ for i in range(1, depth):
    input_tmp = [mix_hidden, lstm]
 ```

- We will concatenate the output of the top LSTM unit with its input, and project the result into a hidden layer. Then, we put a fully connected layer on top to get the final feature vector representation.
+- In PaddlePaddle, the state features and transition features of a CRF are implemented by a fully connected layer and a CRF layer, respectively. The fully connected layer with linear activation learns the state features; here we use paddle.layer.mixed (paddle.layer.fc can be used as well). The CRF layer in PaddlePaddle, paddle.layer.crf, only learns the transition features; it is a cost layer and is the last layer of the network. Given the input sequence, paddle.layer.crf outputs the log probability of the true tag sequence as the cost, and it requires the true tag sequence as the target during training.

- ```python
- feature_out = paddle.layer.mixed(
- size=label_dict_len,
- bias_attr=std_default,
- input=[
-     paddle.layer.full_matrix_projection(
-         input=input_tmp[0], param_attr=hidden_para_attr),
-     paddle.layer.full_matrix_projection(
-         input=input_tmp[1], param_attr=lstm_para_attr)
- ], )
- ```
+```python

- At the end of the network, we use CRF as the cost function; the parameter of CRF cost will be named `crfw`.
+# The output of the top LSTM unit and its input are fed into a fully connected layer,
+# whose size equals the size of the tag label dictionary.
+# The fully connected layer learns the state features.
+
+feature_out = paddle.layer.mixed(
+    size=label_dict_len,
+    bias_attr=std_default,
+    input=[
+        paddle.layer.full_matrix_projection(
+            input=input_tmp[0], param_attr=hidden_para_attr),
+        paddle.layer.full_matrix_projection(
+            input=input_tmp[1], param_attr=lstm_para_attr)], )

-```python
 crf_cost = paddle.layer.crf(
    size=label_dict_len,
    input=feature_out,
@@ -412,7 +413,7 @@ crf_cost = paddle.layer.crf(
        learning_rate=mix_hidden_lr))
 ```

- The CRF decoding layer is used for evaluation and inference. It shares weights with CRF layer.  The sharing of parameters among multiple layers is specified by using the same parameter name in these layers.
+- The CRF decoding layer is used for evaluation and inference. It shares weights with the CRF layer. The sharing of parameters among multiple layers is specified by using the same parameter name in these layers. If the true tag sequence is provided during training, `paddle.layer.crf_decoding` calculates the labelling error for each input token and `evaluator.sum` sums the error over the entire sequence. Otherwise, `paddle.layer.crf_decoding` generates the labelling tags.

 ```python
 crf_dec = paddle.layer.crf_decoding(
@@ -456,7 +457,7 @@ We will create trainer given model topology, parameters, and optimization method
 ```python
 optimizer = paddle.optimizer.Momentum(
    momentum=0,
-    learning_rate=2e-2,
+    learning_rate=1e-3,
    regularization=paddle.optimizer.L2Regularization(rate=8e-4),
    model_average=paddle.optimizer.ModelAverage(
        average_window=0.5, max_average_window=10000), )
@@ -474,7 +475,7 @@ As mentioned in data preparation section, we will use CoNLL 2005 test corpus as
 ```python
 reader = paddle.batch(
    paddle.reader.shuffle(
-        conll05.test(), buf_size=8192), batch_size=20)
+        conll05.test(), buf_size=8192), batch_size=2)
 ```

 `feeding` is used to specify the correspondence between data instance and data layer. For example, according to following `feeding`, the 0th column of data instance produced by`conll05.test()` is matched to the data layer named `word_data`.
@@ -498,17 +499,17 @@ feeding = {
 ```python
 def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
-        if event.batch_id % 100 == 0:
+        if event.batch_id and event.batch_id % 10 == 0:
            print "Pass %d, Batch %d, Cost %f, %s" % (
                event.pass_id, event.batch_id, event.cost, event.metrics)
-        if event.batch_id % 1000 == 0:
+        if event.batch_id % 400 == 0:
            result = trainer.test(reader=reader, feeding=feeding)
            print "\nTest with Pass %d, Batch %d, %s" % (event.pass_id, event.batch_id, result.metrics)

    if isinstance(event, paddle.event.EndPass):
        # save parameters
        with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
-            parameters.to_tar(f)  
+            parameters.to_tar(f)

        result = trainer.test(reader=reader, feeding=feeding)
        print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)

--- a/06.label_semantic_roles/index.html
+++ b/06.label_semantic_roles/index.html
@@ -234,6 +234,7 @@ import numpy as np
 import gzip
 import paddle.v2 as paddle
 import paddle.v2.dataset.conll05 as conll05
+import paddle.v2.evaluator as evaluator

 paddle.init(use_gpu=False, trainer_count=1)

@@ -316,12 +317,12 @@ emb_layers.append(mark_embedding)

 ```python  
 hidden_0 = paddle.layer.mixed(
-size=hidden_dim,
-bias_attr=std_default,
-input=[
-    paddle.layer.full_matrix_projection(
-        input=emb, param_attr=std_default) for emb in emb_layers
-])
+    size=hidden_dim,
+    bias_attr=std_default,
+    input=[
+        paddle.layer.full_matrix_projection(
+            input=emb, param_attr=std_default) for emb in emb_layers
+    ])

 mix_hidden_lr = 1e-3
 lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0)
@@ -362,23 +363,24 @@ for i in range(1, depth):
    input_tmp = [mix_hidden, lstm]
 ```

- 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射，经过一个全连接层映射到标记字典的维度，得到最终的特征向量表示。
+- 在PaddlePaddle中，CRF的状态特征和转移特征分别由一个全连接层和一个PaddlePaddle中的CRF层分别学习。在这个例子中，我们用线性激活的paddle.layer.mixed 来学习CRF的状态特征（也可以使用paddle.layer.fc），而 paddle.layer.crf只学习转移特征。paddle.layer.crf层是一个 cost 层，处于整个网络的末端，输出给定输入序列下，标记序列的log probability作为代价。训练阶段，该层需要输入正确的标记序列作为学习目标。

 ```python
-feature_out = paddle.layer.mixed(
-size=label_dict_len,
-bias_attr=std_default,
-input=[
-    paddle.layer.full_matrix_projection(
-        input=input_tmp[0], param_attr=hidden_para_attr),
-    paddle.layer.full_matrix_projection(
-        input=input_tmp[1], param_attr=lstm_para_attr)
-], )
-```

- 网络的末端定义CRF层计算损失(cost)，指定参数名字为 `crfw`，该层需要输入正确的数据标签(target)。
+# 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射，
+# 经过一个全连接层映射到标记字典的维度，来学习 CRF 的状态特征

-```python
+feature_out = paddle.layer.mixed(
+    size=label_dict_len,
+    bias_attr=std_default,
+    input=[
+        paddle.layer.full_matrix_projection(
+            input=input_tmp[0], param_attr=hidden_para_attr),
+        paddle.layer.full_matrix_projection(
+            input=input_tmp[1], param_attr=lstm_para_attr)
+    ], )
+
+# 学习 CRF 的转移特征
 crf_cost = paddle.layer.crf(
    size=label_dict_len,
    input=feature_out,
@@ -389,7 +391,7 @@ crf_cost = paddle.layer.crf(
        learning_rate=mix_hidden_lr))
 ```

- CRF译码层和CRF层参数名字相同，即共享权重。如果输入了正确的数据标签(target)，会统计错误标签的个数，可以用来评估模型。如果没有输入正确的数据标签，该层可以推到出最优解，可以用来预测模型。
+- CRF解码和CRF层参数名字相同，即：加载了`paddle.layer.crf`层学习到的参数。在训练阶段，为`paddle.layer.crf_decoding` 输入了正确的标记序列(target)，这一层会输出是否正确标记，`evaluator.sum` 用来计算序列上的标记错误率，可以用来评估模型。解码阶段，没有输入正确的数据标签，该层通过寻找概率最高的标记序列，解码出标记结果。

 ```python
 crf_dec = paddle.layer.crf_decoding(
@@ -436,7 +438,7 @@ parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))
 # create optimizer
 optimizer = paddle.optimizer.Momentum(
    momentum=0,
-    learning_rate=2e-2,
+    learning_rate=1e-3,
    regularization=paddle.optimizer.L2Regularization(rate=8e-4),
    model_average=paddle.optimizer.ModelAverage(
        average_window=0.5, max_average_window=10000), )
@@ -454,7 +456,7 @@ trainer = paddle.trainer.SGD(cost=crf_cost,
 ```python
 reader = paddle.batch(
    paddle.reader.shuffle(
-        conll05.test(), buf_size=8192), batch_size=20)
+        conll05.test(), buf_size=8192), batch_size=2)
 ```

 通过`feeding`来指定每一个数据和data_layer的对应关系。 例如 下面`feeding`表示: `conll05.test()`产生数据的第0列对应`word_data`层的特征。
@@ -479,17 +481,17 @@ feeding = {
 ```python
 def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
-        if event.batch_id % 100 == 0:
+        if event.batch_id and event.batch_id % 10 == 0:
            print "Pass %d, Batch %d, Cost %f, %s" % (
                event.pass_id, event.batch_id, event.cost, event.metrics)
-        if event.batch_id % 1000 == 0:
+        if event.batch_id % 400 == 0:
            result = trainer.test(reader=reader, feeding=feeding)
            print "\nTest with Pass %d, Batch %d, %s" % (event.pass_id, event.batch_id, result.metrics)

    if isinstance(event, paddle.event.EndPass):
        # save parameters
        with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
-            parameters.to_tar(f)  
+            parameters.to_tar(f)

        result = trainer.test(reader=reader, feeding=feeding)
        print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
@@ -501,7 +503,7 @@ def event_handler(event):
 trainer.train(
    reader=reader,
    event_handler=event_handler,
-    num_passes=10000,
+    num_passes=1,
    feeding=feeding)
 ```


--- a/07.machine_translation/README.en.md
+++ b/07.machine_translation/README.en.md
--- a/07.machine_translation/README.md
+++ b/07.machine_translation/README.md
--- a/07.machine_translation/data/wmt14_data.sh
+++ b/07.machine_translation/data/wmt14_data.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-set -x
-mkdir wmt14
-cd wmt14
-
-# download the dataset
-wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz
-wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz
-
-# untar the dataset
-tar -zxvf bitexts.tgz
-tar -zxvf dev+test.tgz
-gunzip bitexts.selected/*
-mv bitexts.selected train
-rm bitexts.tgz
-rm dev+test.tgz
-
-# separate the dev and test dataset
-mkdir test gen
-mv dev/ntst1213.* test
-mv dev/ntst14.* gen
-rm -rf dev
-
-set +x
-# rename the suffix, .fr->.src, .en->.trg
-for dir in train test gen
-do
-  filelist=`ls $dir`
-  cd $dir
-  for file in $filelist
-  do
-    if [ ${file##*.} = "fr" ]; then
-      mv $file ${file/%fr/src}
-    elif [ ${file##*.} = 'en' ]; then
-      mv $file ${file/%en/trg}
-    fi
-  done
-  cd ..
-done
--- a/07.machine_translation/eval_bleu.sh
+++ b/07.machine_translation/eval_bleu.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-gen_file=$1
-beam_size=$2
-
-# find top1 generating result
-top1=$(printf '%s_top1.txt' `basename $gen_file .txt`)
-if [ $beam_size -eq 1 ]; then
-    awk -F "\t" '{sub(" <e>","",$2);sub(" ","",$2);print $2}' $gen_file >$top1
-else
-    awk 'BEGIN{
-        FS="\t";
-        OFS="\t";
-        read_pos = 2} {
-        if (NR == read_pos){
-            sub(" <e>","",$3);
-            sub(" ","",$3);
-            print $3;
-            read_pos += (2 + res_num);
-      }}' res_num=$beam_size $gen_file >$top1
-fi
-
-# evalute bleu value
-bleu_script=multi-bleu.perl
-standard_res=data/wmt14/gen/ntst14.trg
-bleu_res=`perl $bleu_script $standard_res <$top1`
-
-echo $bleu_res | cut -d, -f 1
-rm $top1
--- a/07.machine_translation/index.en.html
+++ b/07.machine_translation/index.en.html
--- a/07.machine_translation/index.html
+++ b/07.machine_translation/index.html
--- a/07.machine_translation/moses_bleu.sh
+++ b/07.machine_translation/moses_bleu.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-set -x
-echo "Downloading multi-bleu.perl"
-wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl --no-check-certificate
--- a/07.machine_translation/pretrained/wmt14_model.sh
+++ b/07.machine_translation/pretrained/wmt14_model.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-set -x
-
-# download the pretrained model
-wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz
-
-# untar the model
-tar -zxvf wmt14_model.tar.gz
-rm wmt14_model.tar.gz
--- a/07.machine_translation/train.py
+++ b/07.machine_translation/train.py
 import sys
+
 import paddle.v2 as paddle


-def seqToseq_net(source_dict_dim, target_dict_dim):
+def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
    ### Network Architecture
    word_vector_dim = 512  # dimension of word vector
    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
    encoder_size = 512  # dimension of hidden unit in GRU Encoder network

+    beam_size = 3
+    max_length = 250
+
    #### Encoder
    src_word_id = paddle.layer.data(
        name='source_language_word',
@@ -67,78 +71,142 @@ def seqToseq_net(source_dict_dim, target_dict_dim):
    group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
    group_inputs = [group_input1, group_input2]

-    trg_embedding = paddle.layer.embedding(
-        input=paddle.layer.data(
-            name='target_language_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-    group_inputs.append(trg_embedding)
-
-    # For decoder equipped with attention mechanism, in training,
-    # target embeding (the groudtruth) is the data input,
-    # while encoded source sequence is accessed to as an unbounded memory.
-    # Here, the StaticInput defines a read-only memory
-    # for the recurrent_group.
-    decoder = paddle.layer.recurrent_group(
-        name=decoder_group_name,
-        step=gru_decoder_with_attention,
-        input=group_inputs)
-
-    lbl = paddle.layer.data(
-        name='target_language_next_word',
-        type=paddle.data_type.integer_value_sequence(target_dict_dim))
-    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
-
-    return cost
+    if not is_generating:
+        trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+        group_inputs.append(trg_embedding)
+
+        # For a decoder equipped with an attention mechanism, in training,
+        # the target embedding (the ground truth) is the data input,
+        # while the encoded source sequence is accessed as an unbounded memory.
+        # Here, the StaticInput defines a read-only memory
+        # for the recurrent_group.
+        decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs)
+
+        lbl = paddle.layer.data(
+            name='target_language_next_word',
+            type=paddle.data_type.integer_value_sequence(target_dict_dim))
+        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+        return cost
+    else:
+        # In generation, the decoder predicts the next target word based on
+        # the encoded source sequence and the last generated target word.
+
+        # The encoded source sequence (encoder's output) must be specified by
+        # StaticInput, which is a read-only memory.
+        # The embedding of the last generated word is automatically obtained
+        # by GeneratedInput, which is initialized by a start mark, such as <s>,
+        # and must be included in generation.
+
+        trg_embedding = paddle.layer.GeneratedInputV2(
+            size=target_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vector_dim)
+        group_inputs.append(trg_embedding)
+
+        beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs,
+            bos_id=0,
+            eos_id=1,
+            beam_size=beam_size,
+            max_length=max_length)
+
+        return beam_gen


 def main():
    paddle.init(use_gpu=False, trainer_count=1)
+    is_generating = False

    # source and target dict dim.
    dict_size = 30000
    source_dict_dim = target_dict_dim = dict_size

-    # define network topology
-    cost = seqToseq_net(source_dict_dim, target_dict_dim)
-    parameters = paddle.parameters.create(cost)
-
-    # define optimize method and trainer
-    optimizer = paddle.optimizer.Adam(
-        learning_rate=5e-5,
-        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
-    trainer = paddle.trainer.SGD(
-        cost=cost, parameters=parameters, update_equation=optimizer)
-
-    # define data reader
-    feeding = {
-        'source_language_word': 0,
-        'target_language_word': 1,
-        'target_language_next_word': 2
-    }
-
-    wmt14_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
-        batch_size=5)
-
-    # define event_handler callback
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 10 == 0:
-                print "\nPass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
+    # train the network
+    if not is_generating:
+        cost = seqToseq_net(source_dict_dim, target_dict_dim)
+        parameters = paddle.parameters.create(cost)
+
+        # define optimize method and trainer
+        optimizer = paddle.optimizer.Adam(
+            learning_rate=5e-5,
+            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+        trainer = paddle.trainer.SGD(
+            cost=cost, parameters=parameters, update_equation=optimizer)
+        # define data reader
+        wmt14_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.wmt14.train(dict_size), buf_size=8192),
+            batch_size=5)
+
+        # define event_handler callback
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndIteration):
+                if event.batch_id % 10 == 0:
+                    print "\nPass %d, Batch %d, Cost %f, %s" % (
+                        event.pass_id, event.batch_id, event.cost,
+                        event.metrics)
+                else:
+                    sys.stdout.write('.')
+                    sys.stdout.flush()
+
+        # start to train
+        trainer.train(
+            reader=wmt14_reader, event_handler=event_handler, num_passes=2)
+
+    # generate English translations of French source sequences
+    else:
+        # use the first 3 samples for generation
+        gen_creator = paddle.dataset.wmt14.gen(dict_size)
+        gen_data = []
+        gen_num = 3
+        for item in gen_creator():
+            gen_data.append((item[0], ))
+            if len(gen_data) == gen_num:
+                break
+
+        beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating)
+        # get the pretrained model, whose bleu = 26.92
+        parameters = paddle.dataset.wmt14.model()
+        # prob holds the prediction probabilities, and id holds the predicted word ids.
+        beam_result = paddle.infer(
+            output_layer=beam_gen,
+            parameters=parameters,
+            input=gen_data,
+            field=['prob', 'id'])
+
+        # get the dictionary
+        src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+
+        # generated sequences are delimited by -1, and
+        # the first element of each generated sequence is the sequence length
+        seq_list = []
+        seq = []
+        for w in beam_result[1]:
+            if w != -1:
+                seq.append(w)
            else:
-                sys.stdout.write('.')
-                sys.stdout.flush()
-
-    # start to train
-    trainer.train(
-        reader=wmt14_reader,
-        event_handler=event_handler,
-        num_passes=2,
-        feeding=feeding)
+                seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
+                seq = []
+
+        prob = beam_result[0]
+        beam_size = 3
+        for i in xrange(gen_num):
+            print "\n*******************************************************\n"
+            print "src:", ' '.join(
+                [src_dict.get(w) for w in gen_data[i][0]]), "\n"
+            for j in xrange(beam_size):
+                print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]


 if __name__ == '__main__':