From 0a31f10996afd2b1c5e8a3df5f80910f38aa1910 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Mon, 18 Feb 2019 00:18:14 +0800 Subject: [PATCH] Refine the cn doc of srl --- 07.label_semantic_roles/README.cn.md | 168 ++++++++++---------------- 07.label_semantic_roles/index.cn.html | 168 ++++++++++---------------- 07.label_semantic_roles/train.py | 2 +- 3 files changed, 123 insertions(+), 215 deletions(-) diff --git a/07.label_semantic_roles/README.cn.md b/07.label_semantic_roles/README.cn.md index 7faa20a..590e319 100644 --- a/07.label_semantic_roles/README.cn.md +++ b/07.label_semantic_roles/README.cn.md @@ -151,7 +151,7 @@ conll05st-release/ 4. 构造以BIO法表示的标记; 5. 依据词典获取词对应的整数索引。 -预处理完成之后一条训练样本包含9个特征,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下区域标志、标注序列。下表是一条训练样本的示例。 +预处理完成之后一条训练样本数据包含9个域,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下区域标志、标注序列。下表是一条训练样本的示例。 | 句子序列 | 谓词 | 谓词上下文(窗口 = 5) | 谓词上下文区域标记 | 标注序列 | |---|---|---|---|---| @@ -206,33 +206,34 @@ print('pred_dict_len: ', pred_dict_len) - 定义输入数据维度及模型超参数。 ```python -mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 -word_dim = 32 # 词向量维度 -mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 -hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 -depth = 8 # 栈式LSTM的深度 -mix_hidden_lr = 1e-3 +mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 +word_dim = 32 # 词向量维度 +mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 +hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 +depth = 8 # 栈式LSTM的深度 +mix_hidden_lr = 1e-3 # linear_chain_crf层的基础学习率 -IS_SPARSE = True -PASS_NUM = 10 -BATCH_SIZE = 10 +IS_SPARSE = True # 是否以稀疏方式更新embedding +PASS_NUM = 10 # 训练轮数 +BATCH_SIZE = 10 # batch size 大小 embedding_name = 'emb' ``` -这里需要特别说明的是hidden_dim = 512指定了LSTM隐层向量的维度为128维,关于这一点请参考PaddlePaddle官方文档中[lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)的说明。 +这里需要特别说明的是,参数 `hidden_dim = 512` 实际指定了LSTM隐层向量的维度为128,关于这一点请参考PaddlePaddle官方文档中[dynamic_lstm](http://www.paddlepaddle.org/documentation/docs/zh/1.2/api_cn/layers_cn.html#dynamic-lstm)的说明。 - 如上文提到,我们用基于英文维基百科训练好的词向量来初始化序列输入、谓词上下文总共6个特征的embedding层参数,在训练中不更新。 ```python -# 这里加载PaddlePaddle上版保存的二进制模型 +# 这里加载PaddlePaddle保存的二进制参数 def load_parameter(file_name, h, w): with open(file_name, 'rb') as f: f.read(16) # skip header. return np.fromfile(f, dtype=np.float32).reshape(h, w) ``` -- 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习。 +- 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习,主要的执行逻辑如下: + 1)为不同的输入特征分别定义embedding层 ```python def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, @@ -252,8 +253,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, is_sparse=IS_SPARSE) word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] - # Since word vector lookup table is pre-trained, we won't update it this time. - # trainable being False prevents updating the lookup table during training. + # 因词向量是预训练好的,这里不再训练embedding表, + # 参数属性trainable设置成False阻止了embedding表在训练过程中被更新 emb_layers = [ fluid.layers.embedding( size=[word_dict_len, word_dim], @@ -263,9 +264,12 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) +``` +2) 定义深度双向LSTM结构 - # 8 LSTM units are trained through alternating left-to-right / right-to-left order - # denoted by the variable `reverse`. 
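Before the stacked LSTM below is assembled, it may help to see concretely why `hidden_dim = 512` yields 128-dimensional LSTM states: `fluid.layers.dynamic_lstm` expects its input to already be projected to 4 × (hidden size), one slice per gate, so both the `fc` projection width and the `size` argument are 512 while the recurrent hidden/cell state is 512 / 4 = 128. A minimal standalone sketch (toy input shape and hypothetical variable names, not part of the chapter code):

```python
import paddle.fluid as fluid

# Toy sequence input: every token is a 32-dim vector (word_dim), LoD level 1.
x = fluid.layers.data(name='x', shape=[32], dtype='float32', lod_level=1)

# Project each token to 4 * 128 = 512 units; dynamic_lstm splits this projection
# into the pre-activations of the input, forget, cell and output gates.
proj = fluid.layers.fc(input=x, size=512, act='tanh')

# size=512 here as well, but the hidden and cell states that come out are
# 512 / 4 = 128 dimensions each.
hidden, cell = fluid.layers.dynamic_lstm(input=proj, size=512, is_reverse=False)
```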
+```python + # 共有8个LSTM单元被训练,每个单元的方向为从左到右或从右到左, + # 由参数`is_reverse`确定 hidden_0_layers = [ fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') for emb in emb_layers @@ -280,19 +284,9 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, gate_activation='sigmoid', cell_activation='sigmoid') - # stack L-LSTM and R-LSTM with direct edges + # 用直连的边来堆叠L-LSTM、R-LSTM input_tmp = [hidden_0, lstm_0] - # In PaddlePaddle, state features and transition features of a CRF are implemented - # by a fully connected layer and a CRF layer seperately. The fully connected layer - # with linear activation learns the state features, here we use fluid.layers.sums - # (fluid.layers.fc can be uesed as well), and the CRF layer in PaddlePaddle: - # fluid.layers.linear_chain_crf only - # learns the transition features, which is a cost layer and is the last layer of the network. - # fluid.layers.linear_chain_crf outputs the log probability of true tag sequence - # as the cost by given the input sequence and it requires the true tag sequence - # as target in the learning process. - for i in range(1, depth): mix_hidden = fluid.layers.sums(input=[ fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), @@ -323,55 +317,14 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, - 我们根据网络拓扑结构和模型参数来构造出trainer用来训练,在构造时还需指定优化方法,这里使用最基本的SGD方法(momentum设置为0),同时设定了学习率、正则等。 -- 数据介绍部分提到CoNLL 2005训练集付费,这里我们使用测试集训练供大家学习。conll05.test()每次产生一条样本,包含9个特征,shuffle和组完batch后作为训练的输入。 - -- 通过feeding来指定每一个数据和data_layer的对应关系。 例如 下面feeding表示: conll05.test()产生数据的第0列对应word_data层的特征。 - -- 可以使用event_handler回调函数来观察训练过程,或进行测试等。这里我们打印了训练过程的cost,该回调函数是trainer.train函数里设定。 - -- 通过trainer.train函数训练 - ```python -def train(use_cuda, save_dirname=None, is_local=True): - # define network topology - - # 句子序列 - word = fluid.layers.data( - name='word_data', shape=[1], dtype='int64', lod_level=1) - - # 谓词 - predicate = fluid.layers.data( - name='verb_data', shape=[1], dtype='int64', lod_level=1) - - # 谓词上下文5个特征 - ctx_n2 = fluid.layers.data( - name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) - ctx_n1 = fluid.layers.data( - name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) - ctx_0 = fluid.layers.data( - name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) - ctx_p1 = fluid.layers.data( - name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) - ctx_p2 = fluid.layers.data( - name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) - - # 谓词上下区域标志 - mark = fluid.layers.data( - name='mark_data', shape=[1], dtype='int64', lod_level=1) - - # define network topology feature_out = db_lstm(**locals()) - - # 标注序列 target = fluid.layers.data( name='target', shape=[1], dtype='int64', lod_level=1) - - # 学习 CRF 的转移特征 crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=mix_hidden_lr)) + param_attr=fluid.ParamAttr(name='crfw', learning_rate=mix_hidden_lr)) avg_cost = fluid.layers.mean(crf_cost) @@ -383,31 +336,29 @@ def train(use_cuda, save_dirname=None, is_local=True): staircase=True)) sgd_optimizer.minimize(avg_cost) +``` - # The CRF decoding layer is used for evaluation and inference. - # It shares weights with CRF layer. The sharing of parameters among multiple layers - # is specified by using the same parameter name in these layers. If true tag sequence - # is provided in training process, `fluid.layers.crf_decoding` calculates labelling error - # for each input token and sums the error over the entire sequence. 
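In this network the CRF is split across two kinds of layers: the fully connected layers learn the state features, while `fluid.layers.linear_chain_crf` learns only the transition features and acts as the cost layer. For evaluation and inference, a `fluid.layers.crf_decoding` layer shares those transition weights simply by reusing the same parameter name `'crfw'`; if the true tag sequence is also fed it reports the labelling error per token, otherwise it produces the predicted tag sequence. A short sketch of that decoding layer (it belongs inside `train()` after `feature_out` is defined and reuses the `'crfw'` parameter created for `crf_cost`):

```python
    # Decoding side of the CRF: shares the transition weights with
    # linear_chain_crf by reusing the parameter name 'crfw'.
    crf_decode = fluid.layers.crf_decoding(
        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
```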
- # Otherwise, `fluid.layers.crf_decoding` generates the labelling tags. - crf_decode = fluid.layers.crf_decoding( - input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) +- 数据介绍部分提到CoNLL 2005训练集付费,这里我们使用测试集训练供大家学习。conll05.test()每次产生一条样本,包含9个特征,shuffle和组完batch后作为训练的输入。 +```python train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.test(), buf_size=8192), + paddle.reader.shuffle(paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) +``` - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - +- 通过feeding来指定每一个数据和data_layer的对应关系, 下面的feeding表示 conll05.test()产生数据的第0列对应的data_layer是`word` +```python feeder = fluid.DataFeeder( feed_list=[ word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target ], place=place) - exe = fluid.Executor(place) +``` + +- 最后定义`train_loop()`函数来控制训练过程,并执行`train_loop()`函数 +```python def train_loop(main_program): exe.run(fluid.default_startup_program()) embedding_param = fluid.global_scope().find_var( @@ -420,19 +371,19 @@ def train(use_cuda, save_dirname=None, is_local=True): batch_id = 0 for pass_id in six.moves.xrange(PASS_NUM): for data in train_data(): - cost = exe.run(main_program, - feed=feeder.feed(data), - fetch_list=[avg_cost]) + cost = exe.run( + main_program, feed=feeder.feed(data), fetch_list=[avg_cost]) cost = cost[0] if batch_id % 10 == 0: - print("avg_cost: " + str(cost)) + print("avg_cost:" + str(cost)) if batch_id != 0: - print("second per batch: " + str((time.time( - ) - start_time) / batch_id)) + print("second per batch: " + str(( + time.time() - start_time) / batch_id)) # Set the threshold low to speed up the CI test if float(cost) < 60.0: if save_dirname is not None: + # TODO(liuyiqun): Change the target to crf_decode fluid.io.save_inference_model(save_dirname, [ 'word_data', 'verb_data', 'ctx_n2_data', 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', @@ -448,7 +399,9 @@ def train(use_cuda, save_dirname=None, is_local=True): ## 应用模型 -训练完成之后,需要依据某个我们关心的性能指标选择最优的模型进行预测,可以简单的选择测试集上标记错误最少的那个模型。以下我们给出一个使用训练后的模型进行预测的示例。 +训练完成之后,需要依据某个我们关心的性能指标选择最优的模型进行预测,可以简单的选择测试集上标记错误最少的那个模型。以下我们给出一个使用训练后的模型进行预测的示例 + +- 加载inference model ```python def infer(use_cuda, save_dirname=None): @@ -460,26 +413,23 @@ def infer(use_cuda, save_dirname=None): inference_scope = fluid.core.Scope() with fluid.scope_guard(inference_scope): - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be fed - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). + # 使用fluid.io.load_inference_model加载inference_program, + # feed_target_names是模型的输入变量的名称,fetch_targets是预测对象 [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) +``` - # Setup inputs by creating LoDTensors to represent sequences of words. - # Here each word is the basic element of these LoDTensors and the shape of - # each word (base_shape) should be [1] since it is simply an index to - # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensors will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. 
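Right after loading, it is worth checking which inputs the saved inference program expects and in what order, because the feed dictionary built below must use exactly these names; the order follows the list passed to `fluid.io.save_inference_model` during training. A small sketch (to be run inside the same `fluid.scope_guard` block; the printed names are checked by the `assert` statements further below):

```python
        # feed_target_names lists the data layer names the inference program
        # needs, in the order they were passed to save_inference_model
        # (starting with 'word_data', 'verb_data', 'ctx_n2_data', ...).
        print(feed_target_names)

        # fetch_targets holds the variables to fetch; in this chapter it is the
        # emission score sequence produced by db_lstm (feature_out).
        print([t.name for t in fetch_targets])
```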
+- 输入数据,这里构造假数据作为输入 + +```python + # 设置输入,用LoDTensor来表示输入的词序列,这里每个词的形状 + # base_shape都是[1],是因为每个词都是用一个id来表示的。 + # 假如基于长度的LoD是[[3, 4, 2]],这是一个单层的LoD,那么构造出的 + # LoDTensor就包含3个序列,其长度分别为3、4和2。 + # 注意LoD是个列表的列表 lod = [[3, 4, 2]] base_shape = [1] - # The range of random integers is [low, high] + # 整数随机数的范围是 [low, high] word = fluid.create_random_int_lodtensor( lod, base_shape, place, low=0, high=word_dict_len - 1) pred = fluid.create_random_int_lodtensor( @@ -496,9 +446,13 @@ def infer(use_cuda, save_dirname=None): lod, base_shape, place, low=0, high=word_dict_len - 1) mark = fluid.create_random_int_lodtensor( lod, base_shape, place, low=0, high=mark_dict_len - 1) +``` + +- 执行预测 - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. +```python + # 构造feed字典 {feed_target_name: feed_target_data} + # results是由预测目标构成的列表 assert feed_target_names[0] == 'word_data' assert feed_target_names[1] == 'verb_data' assert feed_target_names[2] == 'ctx_n2_data' diff --git a/07.label_semantic_roles/index.cn.html b/07.label_semantic_roles/index.cn.html index 9972da6..dc80c66 100644 --- a/07.label_semantic_roles/index.cn.html +++ b/07.label_semantic_roles/index.cn.html @@ -193,7 +193,7 @@ conll05st-release/ 4. 构造以BIO法表示的标记; 5. 依据词典获取词对应的整数索引。 -预处理完成之后一条训练样本包含9个特征,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下区域标志、标注序列。下表是一条训练样本的示例。 +预处理完成之后一条训练样本数据包含9个域,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下区域标志、标注序列。下表是一条训练样本的示例。 | 句子序列 | 谓词 | 谓词上下文(窗口 = 5) | 谓词上下文区域标记 | 标注序列 | |---|---|---|---|---| @@ -248,33 +248,34 @@ print('pred_dict_len: ', pred_dict_len) - 定义输入数据维度及模型超参数。 ```python -mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 -word_dim = 32 # 词向量维度 -mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 -hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 -depth = 8 # 栈式LSTM的深度 -mix_hidden_lr = 1e-3 +mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 +word_dim = 32 # 词向量维度 +mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 +hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 +depth = 8 # 栈式LSTM的深度 +mix_hidden_lr = 1e-3 # linear_chain_crf层的基础学习率 -IS_SPARSE = True -PASS_NUM = 10 -BATCH_SIZE = 10 +IS_SPARSE = True # 是否以稀疏方式更新embedding +PASS_NUM = 10 # 训练轮数 +BATCH_SIZE = 10 # batch size 大小 embedding_name = 'emb' ``` -这里需要特别说明的是hidden_dim = 512指定了LSTM隐层向量的维度为128维,关于这一点请参考PaddlePaddle官方文档中[lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)的说明。 +这里需要特别说明的是,参数 `hidden_dim = 512` 实际指定了LSTM隐层向量的维度为128,关于这一点请参考PaddlePaddle官方文档中[dynamic_lstm](http://www.paddlepaddle.org/documentation/docs/zh/1.2/api_cn/layers_cn.html#dynamic-lstm)的说明。 - 如上文提到,我们用基于英文维基百科训练好的词向量来初始化序列输入、谓词上下文总共6个特征的embedding层参数,在训练中不更新。 ```python -# 这里加载PaddlePaddle上版保存的二进制模型 +# 这里加载PaddlePaddle保存的二进制参数 def load_parameter(file_name, h, w): with open(file_name, 'rb') as f: f.read(16) # skip header. return np.fromfile(f, dtype=np.float32).reshape(h, w) ``` -- 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习。 +- 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习,主要的执行逻辑如下: + 1)为不同的输入特征分别定义embedding层 ```python def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, @@ -294,8 +295,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, is_sparse=IS_SPARSE) word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] - # Since word vector lookup table is pre-trained, we won't update it this time. - # trainable being False prevents updating the lookup table during training. 
+ # 因词向量是预训练好的,这里不再训练embedding表, + # 参数属性trainable设置成False阻止了embedding表在训练过程中被更新 emb_layers = [ fluid.layers.embedding( size=[word_dict_len, word_dim], @@ -305,9 +306,12 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) +``` +2) 定义深度双向LSTM结构 - # 8 LSTM units are trained through alternating left-to-right / right-to-left order - # denoted by the variable `reverse`. +```python + # 共有8个LSTM单元被训练,每个单元的方向为从左到右或从右到左, + # 由参数`is_reverse`确定 hidden_0_layers = [ fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') for emb in emb_layers @@ -322,19 +326,9 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, gate_activation='sigmoid', cell_activation='sigmoid') - # stack L-LSTM and R-LSTM with direct edges + # 用直连的边来堆叠L-LSTM、R-LSTM input_tmp = [hidden_0, lstm_0] - # In PaddlePaddle, state features and transition features of a CRF are implemented - # by a fully connected layer and a CRF layer seperately. The fully connected layer - # with linear activation learns the state features, here we use fluid.layers.sums - # (fluid.layers.fc can be uesed as well), and the CRF layer in PaddlePaddle: - # fluid.layers.linear_chain_crf only - # learns the transition features, which is a cost layer and is the last layer of the network. - # fluid.layers.linear_chain_crf outputs the log probability of true tag sequence - # as the cost by given the input sequence and it requires the true tag sequence - # as target in the learning process. - for i in range(1, depth): mix_hidden = fluid.layers.sums(input=[ fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), @@ -365,55 +359,14 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, - 我们根据网络拓扑结构和模型参数来构造出trainer用来训练,在构造时还需指定优化方法,这里使用最基本的SGD方法(momentum设置为0),同时设定了学习率、正则等。 -- 数据介绍部分提到CoNLL 2005训练集付费,这里我们使用测试集训练供大家学习。conll05.test()每次产生一条样本,包含9个特征,shuffle和组完batch后作为训练的输入。 - -- 通过feeding来指定每一个数据和data_layer的对应关系。 例如 下面feeding表示: conll05.test()产生数据的第0列对应word_data层的特征。 - -- 可以使用event_handler回调函数来观察训练过程,或进行测试等。这里我们打印了训练过程的cost,该回调函数是trainer.train函数里设定。 - -- 通过trainer.train函数训练 - ```python -def train(use_cuda, save_dirname=None, is_local=True): - # define network topology - - # 句子序列 - word = fluid.layers.data( - name='word_data', shape=[1], dtype='int64', lod_level=1) - - # 谓词 - predicate = fluid.layers.data( - name='verb_data', shape=[1], dtype='int64', lod_level=1) - - # 谓词上下文5个特征 - ctx_n2 = fluid.layers.data( - name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) - ctx_n1 = fluid.layers.data( - name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) - ctx_0 = fluid.layers.data( - name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) - ctx_p1 = fluid.layers.data( - name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) - ctx_p2 = fluid.layers.data( - name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) - - # 谓词上下区域标志 - mark = fluid.layers.data( - name='mark_data', shape=[1], dtype='int64', lod_level=1) - - # define network topology feature_out = db_lstm(**locals()) - - # 标注序列 target = fluid.layers.data( name='target', shape=[1], dtype='int64', lod_level=1) - - # 学习 CRF 的转移特征 crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=mix_hidden_lr)) + param_attr=fluid.ParamAttr(name='crfw', learning_rate=mix_hidden_lr)) avg_cost = fluid.layers.mean(crf_cost) @@ -425,31 +378,29 @@ def train(use_cuda, save_dirname=None, 
is_local=True): staircase=True)) sgd_optimizer.minimize(avg_cost) +``` - # The CRF decoding layer is used for evaluation and inference. - # It shares weights with CRF layer. The sharing of parameters among multiple layers - # is specified by using the same parameter name in these layers. If true tag sequence - # is provided in training process, `fluid.layers.crf_decoding` calculates labelling error - # for each input token and sums the error over the entire sequence. - # Otherwise, `fluid.layers.crf_decoding` generates the labelling tags. - crf_decode = fluid.layers.crf_decoding( - input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) +- 数据介绍部分提到CoNLL 2005训练集付费,这里我们使用测试集训练供大家学习。conll05.test()每次产生一条样本,包含9个特征,shuffle和组完batch后作为训练的输入。 +```python train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.test(), buf_size=8192), + paddle.reader.shuffle(paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) +``` - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - +- 通过feeding来指定每一个数据和data_layer的对应关系, 下面的feeding表示 conll05.test()产生数据的第0列对应的data_layer是`word` +```python feeder = fluid.DataFeeder( feed_list=[ word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target ], place=place) - exe = fluid.Executor(place) +``` + +- 最后定义`train_loop()`函数来控制训练过程,并执行`train_loop()`函数 +```python def train_loop(main_program): exe.run(fluid.default_startup_program()) embedding_param = fluid.global_scope().find_var( @@ -462,19 +413,19 @@ def train(use_cuda, save_dirname=None, is_local=True): batch_id = 0 for pass_id in six.moves.xrange(PASS_NUM): for data in train_data(): - cost = exe.run(main_program, - feed=feeder.feed(data), - fetch_list=[avg_cost]) + cost = exe.run( + main_program, feed=feeder.feed(data), fetch_list=[avg_cost]) cost = cost[0] if batch_id % 10 == 0: - print("avg_cost: " + str(cost)) + print("avg_cost:" + str(cost)) if batch_id != 0: - print("second per batch: " + str((time.time( - ) - start_time) / batch_id)) + print("second per batch: " + str(( + time.time() - start_time) / batch_id)) # Set the threshold low to speed up the CI test if float(cost) < 60.0: if save_dirname is not None: + # TODO(liuyiqun): Change the target to crf_decode fluid.io.save_inference_model(save_dirname, [ 'word_data', 'verb_data', 'ctx_n2_data', 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', @@ -490,7 +441,9 @@ def train(use_cuda, save_dirname=None, is_local=True): ## 应用模型 -训练完成之后,需要依据某个我们关心的性能指标选择最优的模型进行预测,可以简单的选择测试集上标记错误最少的那个模型。以下我们给出一个使用训练后的模型进行预测的示例。 +训练完成之后,需要依据某个我们关心的性能指标选择最优的模型进行预测,可以简单的选择测试集上标记错误最少的那个模型。以下我们给出一个使用训练后的模型进行预测的示例 + +- 加载inference model ```python def infer(use_cuda, save_dirname=None): @@ -502,26 +455,23 @@ def infer(use_cuda, save_dirname=None): inference_scope = fluid.core.Scope() with fluid.scope_guard(inference_scope): - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be fed - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). + # 使用fluid.io.load_inference_model加载inference_program, + # feed_target_names是模型的输入变量的名称,fetch_targets是预测对象 [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) +``` - # Setup inputs by creating LoDTensors to represent sequences of words. 
- # Here each word is the basic element of these LoDTensors and the shape of - # each word (base_shape) should be [1] since it is simply an index to - # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensors will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. +- 输入数据,这里构造假数据作为输入 + +```python + # 设置输入,用LoDTensor来表示输入的词序列,这里每个词的形状 + # base_shape都是[1],是因为每个词都是用一个id来表示的。 + # 假如基于长度的LoD是[[3, 4, 2]],这是一个单层的LoD,那么构造出的 + # LoDTensor就包含3个序列,其长度分别为3、4和2。 + # 注意LoD是个列表的列表 lod = [[3, 4, 2]] base_shape = [1] - # The range of random integers is [low, high] + # 整数随机数的范围是 [low, high] word = fluid.create_random_int_lodtensor( lod, base_shape, place, low=0, high=word_dict_len - 1) pred = fluid.create_random_int_lodtensor( @@ -538,9 +488,13 @@ def infer(use_cuda, save_dirname=None): lod, base_shape, place, low=0, high=word_dict_len - 1) mark = fluid.create_random_int_lodtensor( lod, base_shape, place, low=0, high=mark_dict_len - 1) +``` + +- 执行预测 - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. +```python + # 构造feed字典 {feed_target_name: feed_target_data} + # results是由预测目标构成的列表 assert feed_target_names[0] == 'word_data' assert feed_target_names[1] == 'verb_data' assert feed_target_names[2] == 'ctx_n2_data' diff --git a/07.label_semantic_roles/train.py b/07.label_semantic_roles/train.py index 87c7f39..2952515 100644 --- a/07.label_semantic_roles/train.py +++ b/07.label_semantic_roles/train.py @@ -104,7 +104,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, def train(use_cuda, save_dirname=None, is_local=True): - # define network topology + # define data layers word = fluid.layers.data( name='word_data', shape=[1], dtype='int64', lod_level=1) predicate = fluid.layers.data( -- GitLab
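For readers reproducing the chapter end to end, a minimal driver sketch tying the pieces together; the `main` function, the save path, and the CPU default below are assumptions for illustration, while `train()` and `infer()` are the functions defined in the sections above:

```python
import paddle.fluid as fluid

def main(use_cuda):
    # Skip silently if CUDA was requested but this Paddle build has no GPU support.
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    # Hypothetical directory for the saved inference model.
    save_dirname = "label_semantic_roles.inference.model"
    train(use_cuda, save_dirname)
    infer(use_cuda, save_dirname)

if __name__ == '__main__':
    main(use_cuda=False)
```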