From 0a31f10996afd2b1c5e8a3df5f80910f38aa1910 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Mon, 18 Feb 2019 00:18:14 +0800 Subject: [PATCH] Refine the cn doc of srl --- 07.label_semantic_roles/README.cn.md | 168 ++++++++++---------------- 07.label_semantic_roles/index.cn.html | 168 ++++++++++---------------- 07.label_semantic_roles/train.py | 2 +- 3 files changed, 123 insertions(+), 215 deletions(-) diff --git a/07.label_semantic_roles/README.cn.md b/07.label_semantic_roles/README.cn.md index 7faa20a..590e319 100644 --- a/07.label_semantic_roles/README.cn.md +++ b/07.label_semantic_roles/README.cn.md @@ -151,7 +151,7 @@ conll05st-release/ 4. 构造以BIO法表示的标记; 5. 依据词典获取词对应的整数索引。 -预处理完成之后一条训练样本包含9个特征,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下区域标志、标注序列。下表是一条训练样本的示例。 +预处理完成之后一条训练样本数据包含9个域,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下区域标志、标注序列。下表是一条训练样本的示例。 | 句子序列 | 谓词 | 谓词上下文(窗口 = 5) | 谓词上下文区域标记 | 标注序列 | |---|---|---|---|---| @@ -206,33 +206,34 @@ print('pred_dict_len: ', pred_dict_len) - 定义输入数据维度及模型超参数。 ```python -mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 -word_dim = 32 # 词向量维度 -mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 -hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 -depth = 8 # 栈式LSTM的深度 -mix_hidden_lr = 1e-3 +mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 +word_dim = 32 # 词向量维度 +mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 +hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 +depth = 8 # 栈式LSTM的深度 +mix_hidden_lr = 1e-3 # linear_chain_crf层的基础学习率 -IS_SPARSE = True -PASS_NUM = 10 -BATCH_SIZE = 10 +IS_SPARSE = True # 是否以稀疏方式更新embedding +PASS_NUM = 10 # 训练轮数 +BATCH_SIZE = 10 # batch size 大小 embedding_name = 'emb' ``` -这里需要特别说明的是hidden_dim = 512指定了LSTM隐层向量的维度为128维,关于这一点请参考PaddlePaddle官方文档中[lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)的说明。 +这里需要特别说明的是,参数 `hidden_dim = 512` 实际指定了LSTM隐层向量的维度为128,关于这一点请参考PaddlePaddle官方文档中[dynamic_lstm](http://www.paddlepaddle.org/documentation/docs/zh/1.2/api_cn/layers_cn.html#dynamic-lstm)的说明。 - 如上文提到,我们用基于英文维基百科训练好的词向量来初始化序列输入、谓词上下文总共6个特征的embedding层参数,在训练中不更新。 ```python -# 这里加载PaddlePaddle上版保存的二进制模型 +# 这里加载PaddlePaddle保存的二进制参数 def load_parameter(file_name, h, w): with open(file_name, 'rb') as f: f.read(16) # skip header. return np.fromfile(f, dtype=np.float32).reshape(h, w) ``` -- 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习。 +- 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习,主要的执行逻辑如下: + 1)为不同的输入特征分别定义embedding层 ```python def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, @@ -252,8 +253,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, is_sparse=IS_SPARSE) word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] - # Since word vector lookup table is pre-trained, we won't update it this time. - # trainable being False prevents updating the lookup table during training. + # 因词向量是预训练好的,这里不再训练embedding表, + # 参数属性trainable设置成False阻止了embedding表在训练过程中被更新 emb_layers = [ fluid.layers.embedding( size=[word_dict_len, word_dim], @@ -263,9 +264,12 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) +``` +2) 定义深度双向LSTM结构 - # 8 LSTM units are trained through alternating left-to-right / right-to-left order - # denoted by the variable `reverse`. 
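Before the stacked LSTM below is assembled, it may help to see concretely why `hidden_dim = 512` yields 128-dimensional LSTM states: `fluid.layers.dynamic_lstm` expects its input to already be projected to 4 × (hidden size), one slice per gate, so both the `fc` projection width and the `size` argument are 512 while the recurrent hidden/cell state is 512 / 4 = 128. A minimal standalone sketch (toy input shape and hypothetical variable names, not part of the chapter code):

```python
import paddle.fluid as fluid

# Toy sequence input: every token is a 32-dim vector (word_dim), LoD level 1.
x = fluid.layers.data(name='x', shape=[32], dtype='float32', lod_level=1)

# Project each token to 4 * 128 = 512 units; dynamic_lstm splits this projection
# into the pre-activations of the input, forget, cell and output gates.
proj = fluid.layers.fc(input=x, size=512, act='tanh')

# size=512 here as well, but the hidden and cell states that come out are
# 512 / 4 = 128 dimensions each.
hidden, cell = fluid.layers.dynamic_lstm(input=proj, size=512, is_reverse=False)
```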
+```python + # 共有8个LSTM单元被训练,每个单元的方向为从左到右或从右到左, + # 由参数`is_reverse`确定 hidden_0_layers = [ fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') for emb in emb_layers @@ -280,19 +284,9 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, gate_activation='sigmoid', cell_activation='sigmoid') - # stack L-LSTM and R-LSTM with direct edges + # 用直连的边来堆叠L-LSTM、R-LSTM input_tmp = [hidden_0, lstm_0] - # In PaddlePaddle, state features and transition features of a CRF are implemented - # by a fully connected layer and a CRF layer seperately. The fully connected layer - # with linear activation learns the state features, here we use fluid.layers.sums - # (fluid.layers.fc can be uesed as well), and the CRF layer in PaddlePaddle: - # fluid.layers.linear_chain_crf only - # learns the transition features, which is a cost layer and is the last layer of the network. - # fluid.layers.linear_chain_crf outputs the log probability of true tag sequence - # as the cost by given the input sequence and it requires the true tag sequence - # as target in the learning process. - for i in range(1, depth): mix_hidden = fluid.layers.sums(input=[ fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), @@ -323,55 +317,14 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, - 我们根据网络拓扑结构和模型参数来构造出trainer用来训练,在构造时还需指定优化方法,这里使用最基本的SGD方法(momentum设置为0),同时设定了学习率、正则等。 -- 数据介绍部分提到CoNLL 2005训练集付费,这里我们使用测试集训练供大家学习。conll05.test()每次产生一条样本,包含9个特征,shuffle和组完batch后作为训练的输入。 - -- 通过feeding来指定每一个数据和data_layer的对应关系。 例如 下面feeding表示: conll05.test()产生数据的第0列对应word_data层的特征。 - -- 可以使用event_handler回调函数来观察训练过程,或进行测试等。这里我们打印了训练过程的cost,该回调函数是trainer.train函数里设定。 - -- 通过trainer.train函数训练 - ```python -def train(use_cuda, save_dirname=None, is_local=True): - # define network topology - - # 句子序列 - word = fluid.layers.data( - name='word_data', shape=[1], dtype='int64', lod_level=1) - - # 谓词 - predicate = fluid.layers.data( - name='verb_data', shape=[1], dtype='int64', lod_level=1) - - # 谓词上下文5个特征 - ctx_n2 = fluid.layers.data( - name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) - ctx_n1 = fluid.layers.data( - name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) - ctx_0 = fluid.layers.data( - name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) - ctx_p1 = fluid.layers.data( - name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) - ctx_p2 = fluid.layers.data( - name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) - - # 谓词上下区域标志 - mark = fluid.layers.data( - name='mark_data', shape=[1], dtype='int64', lod_level=1) - - # define network topology feature_out = db_lstm(**locals()) - - # 标注序列 target = fluid.layers.data( name='target', shape=[1], dtype='int64', lod_level=1) - - # 学习 CRF 的转移特征 crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=mix_hidden_lr)) + param_attr=fluid.ParamAttr(name='crfw', learning_rate=mix_hidden_lr)) avg_cost = fluid.layers.mean(crf_cost) @@ -383,31 +336,29 @@ def train(use_cuda, save_dirname=None, is_local=True): staircase=True)) sgd_optimizer.minimize(avg_cost) +``` - # The CRF decoding layer is used for evaluation and inference. - # It shares weights with CRF layer. The sharing of parameters among multiple layers - # is specified by using the same parameter name in these layers. If true tag sequence - # is provided in training process, `fluid.layers.crf_decoding` calculates labelling error - # for each input token and sums the error over the entire sequence. 
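In this network the CRF is split across two kinds of layers: the fully connected layers learn the state features, while `fluid.layers.linear_chain_crf` learns only the transition features and acts as the cost layer. For evaluation and inference, a `fluid.layers.crf_decoding` layer shares those transition weights simply by reusing the same parameter name `'crfw'`; if the true tag sequence is also fed it reports the labelling error per token, otherwise it produces the predicted tag sequence. A short sketch of that decoding layer (it belongs inside `train()` after `feature_out` is defined and reuses the `'crfw'` parameter created for `crf_cost`):

```python
    # Decoding side of the CRF: shares the transition weights with
    # linear_chain_crf by reusing the parameter name 'crfw'.
    crf_decode = fluid.layers.crf_decoding(
        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
```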
- # Otherwise, `fluid.layers.crf_decoding` generates the labelling tags. - crf_decode = fluid.layers.crf_decoding( - input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) +- 数据介绍部分提到CoNLL 2005训练集付费,这里我们使用测试集训练供大家学习。conll05.test()每次产生一条样本,包含9个特征,shuffle和组完batch后作为训练的输入。 +```python train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.test(), buf_size=8192), + paddle.reader.shuffle(paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) +``` - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - +- 通过feeding来指定每一个数据和data_layer的对应关系, 下面的feeding表示 conll05.test()产生数据的第0列对应的data_layer是`word` +```python feeder = fluid.DataFeeder( feed_list=[ word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target ], place=place) - exe = fluid.Executor(place) +``` + +- 最后定义`train_loop()`函数来控制训练过程,并执行`train_loop()`函数 +```python def train_loop(main_program): exe.run(fluid.default_startup_program()) embedding_param = fluid.global_scope().find_var( @@ -420,19 +371,19 @@ def train(use_cuda, save_dirname=None, is_local=True): batch_id = 0 for pass_id in six.moves.xrange(PASS_NUM): for data in train_data(): - cost = exe.run(main_program, - feed=feeder.feed(data), - fetch_list=[avg_cost]) + cost = exe.run( + main_program, feed=feeder.feed(data), fetch_list=[avg_cost]) cost = cost[0] if batch_id % 10 == 0: - print("avg_cost: " + str(cost)) + print("avg_cost:" + str(cost)) if batch_id != 0: - print("second per batch: " + str((time.time( - ) - start_time) / batch_id)) + print("second per batch: " + str(( + time.time() - start_time) / batch_id)) # Set the threshold low to speed up the CI test if float(cost) < 60.0: if save_dirname is not None: + # TODO(liuyiqun): Change the target to crf_decode fluid.io.save_inference_model(save_dirname, [ 'word_data', 'verb_data', 'ctx_n2_data', 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', @@ -448,7 +399,9 @@ def train(use_cuda, save_dirname=None, is_local=True): ## 应用模型 -训练完成之后,需要依据某个我们关心的性能指标选择最优的模型进行预测,可以简单的选择测试集上标记错误最少的那个模型。以下我们给出一个使用训练后的模型进行预测的示例。 +训练完成之后,需要依据某个我们关心的性能指标选择最优的模型进行预测,可以简单的选择测试集上标记错误最少的那个模型。以下我们给出一个使用训练后的模型进行预测的示例 + +- 加载inference model ```python def infer(use_cuda, save_dirname=None): @@ -460,26 +413,23 @@ def infer(use_cuda, save_dirname=None): inference_scope = fluid.core.Scope() with fluid.scope_guard(inference_scope): - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be fed - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). + # 使用fluid.io.load_inference_model加载inference_program, + # feed_target_names是模型的输入变量的名称,fetch_targets是预测对象 [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) +``` - # Setup inputs by creating LoDTensors to represent sequences of words. - # Here each word is the basic element of these LoDTensors and the shape of - # each word (base_shape) should be [1] since it is simply an index to - # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensors will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. 
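Right after loading, it is worth checking which inputs the saved inference program expects and in what order, because the feed dictionary built below must use exactly these names; the order follows the list passed to `fluid.io.save_inference_model` during training. A small sketch (to be run inside the same `fluid.scope_guard` block; the printed names are checked by the `assert` statements further below):

```python
        # feed_target_names lists the data layer names the inference program
        # needs, in the order they were passed to save_inference_model
        # (starting with 'word_data', 'verb_data', 'ctx_n2_data', ...).
        print(feed_target_names)

        # fetch_targets holds the variables to fetch; in this chapter it is the
        # emission score sequence produced by db_lstm (feature_out).
        print([t.name for t in fetch_targets])
```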
+- 输入数据,这里构造假数据作为输入 + +```python + # 设置输入,用LoDTensor来表示输入的词序列,这里每个词的形状 + # base_shape都是[1],是因为每个词都是用一个id来表示的。 + # 假如基于长度的LoD是[[3, 4, 2]],这是一个单层的LoD,那么构造出的 + # LoDTensor就包含3个序列,其长度分别为3、4和2。 + # 注意LoD是个列表的列表 lod = [[3, 4, 2]] base_shape = [1] - # The range of random integers is [low, high] + # 整数随机数的范围是 [low, high] word = fluid.create_random_int_lodtensor( lod, base_shape, place, low=0, high=word_dict_len - 1) pred = fluid.create_random_int_lodtensor( @@ -496,9 +446,13 @@ def infer(use_cuda, save_dirname=None): lod, base_shape, place, low=0, high=word_dict_len - 1) mark = fluid.create_random_int_lodtensor( lod, base_shape, place, low=0, high=mark_dict_len - 1) +``` + +- 执行预测 - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. +```python + # 构造feed字典 {feed_target_name: feed_target_data} + # results是由预测目标构成的列表 assert feed_target_names[0] == 'word_data' assert feed_target_names[1] == 'verb_data' assert feed_target_names[2] == 'ctx_n2_data' diff --git a/07.label_semantic_roles/index.cn.html b/07.label_semantic_roles/index.cn.html index 9972da6..dc80c66 100644 --- a/07.label_semantic_roles/index.cn.html +++ b/07.label_semantic_roles/index.cn.html @@ -193,7 +193,7 @@ conll05st-release/ 4. 构造以BIO法表示的标记; 5. 依据词典获取词对应的整数索引。 -预处理完成之后一条训练样本包含9个特征,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下区域标志、标注序列。下表是一条训练样本的示例。 +预处理完成之后一条训练样本数据包含9个域,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下区域标志、标注序列。下表是一条训练样本的示例。 | 句子序列 | 谓词 | 谓词上下文(窗口 = 5) | 谓词上下文区域标记 | 标注序列 | |---|---|---|---|---| @@ -248,33 +248,34 @@ print('pred_dict_len: ', pred_dict_len) - 定义输入数据维度及模型超参数。 ```python -mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 -word_dim = 32 # 词向量维度 -mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 -hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 -depth = 8 # 栈式LSTM的深度 -mix_hidden_lr = 1e-3 +mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 +word_dim = 32 # 词向量维度 +mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 +hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 +depth = 8 # 栈式LSTM的深度 +mix_hidden_lr = 1e-3 # linear_chain_crf层的基础学习率 -IS_SPARSE = True -PASS_NUM = 10 -BATCH_SIZE = 10 +IS_SPARSE = True # 是否以稀疏方式更新embedding +PASS_NUM = 10 # 训练轮数 +BATCH_SIZE = 10 # batch size 大小 embedding_name = 'emb' ``` -这里需要特别说明的是hidden_dim = 512指定了LSTM隐层向量的维度为128维,关于这一点请参考PaddlePaddle官方文档中[lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)的说明。 +这里需要特别说明的是,参数 `hidden_dim = 512` 实际指定了LSTM隐层向量的维度为128,关于这一点请参考PaddlePaddle官方文档中[dynamic_lstm](http://www.paddlepaddle.org/documentation/docs/zh/1.2/api_cn/layers_cn.html#dynamic-lstm)的说明。 - 如上文提到,我们用基于英文维基百科训练好的词向量来初始化序列输入、谓词上下文总共6个特征的embedding层参数,在训练中不更新。 ```python -# 这里加载PaddlePaddle上版保存的二进制模型 +# 这里加载PaddlePaddle保存的二进制参数 def load_parameter(file_name, h, w): with open(file_name, 'rb') as f: f.read(16) # skip header. return np.fromfile(f, dtype=np.float32).reshape(h, w) ``` -- 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习。 +- 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习,主要的执行逻辑如下: + 1)为不同的输入特征分别定义embedding层 ```python def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, @@ -294,8 +295,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, is_sparse=IS_SPARSE) word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] - # Since word vector lookup table is pre-trained, we won't update it this time. - # trainable being False prevents updating the lookup table during training. 
+ # 因词向量是预训练好的,这里不再训练embedding表, + # 参数属性trainable设置成False阻止了embedding表在训练过程中被更新 emb_layers = [ fluid.layers.embedding( size=[word_dict_len, word_dim], @@ -305,9 +306,12 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) +``` +2) 定义深度双向LSTM结构 - # 8 LSTM units are trained through alternating left-to-right / right-to-left order - # denoted by the variable `reverse`. +```python + # 共有8个LSTM单元被训练,每个单元的方向为从左到右或从右到左, + # 由参数`is_reverse`确定 hidden_0_layers = [ fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') for emb in emb_layers @@ -322,19 +326,9 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, gate_activation='sigmoid', cell_activation='sigmoid') - # stack L-LSTM and R-LSTM with direct edges + # 用直连的边来堆叠L-LSTM、R-LSTM input_tmp = [hidden_0, lstm_0] - # In PaddlePaddle, state features and transition features of a CRF are implemented - # by a fully connected layer and a CRF layer seperately. The fully connected layer - # with linear activation learns the state features, here we use fluid.layers.sums - # (fluid.layers.fc can be uesed as well), and the CRF layer in PaddlePaddle: - # fluid.layers.linear_chain_crf only - # learns the transition features, which is a cost layer and is the last layer of the network. - # fluid.layers.linear_chain_crf outputs the log probability of true tag sequence - # as the cost by given the input sequence and it requires the true tag sequence - # as target in the learning process. - for i in range(1, depth): mix_hidden = fluid.layers.sums(input=[ fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), @@ -365,55 +359,14 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, - 我们根据网络拓扑结构和模型参数来构造出trainer用来训练,在构造时还需指定优化方法,这里使用最基本的SGD方法(momentum设置为0),同时设定了学习率、正则等。 -- 数据介绍部分提到CoNLL 2005训练集付费,这里我们使用测试集训练供大家学习。conll05.test()每次产生一条样本,包含9个特征,shuffle和组完batch后作为训练的输入。 - -- 通过feeding来指定每一个数据和data_layer的对应关系。 例如 下面feeding表示: conll05.test()产生数据的第0列对应word_data层的特征。 - -- 可以使用event_handler回调函数来观察训练过程,或进行测试等。这里我们打印了训练过程的cost,该回调函数是trainer.train函数里设定。 - -- 通过trainer.train函数训练 - ```python -def train(use_cuda, save_dirname=None, is_local=True): - # define network topology - - # 句子序列 - word = fluid.layers.data( - name='word_data', shape=[1], dtype='int64', lod_level=1) - - # 谓词 - predicate = fluid.layers.data( - name='verb_data', shape=[1], dtype='int64', lod_level=1) - - # 谓词上下文5个特征 - ctx_n2 = fluid.layers.data( - name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) - ctx_n1 = fluid.layers.data( - name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) - ctx_0 = fluid.layers.data( - name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) - ctx_p1 = fluid.layers.data( - name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) - ctx_p2 = fluid.layers.data( - name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) - - # 谓词上下区域标志 - mark = fluid.layers.data( - name='mark_data', shape=[1], dtype='int64', lod_level=1) - - # define network topology feature_out = db_lstm(**locals()) - - # 标注序列 target = fluid.layers.data( name='target', shape=[1], dtype='int64', lod_level=1) - - # 学习 CRF 的转移特征 crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=mix_hidden_lr)) + param_attr=fluid.ParamAttr(name='crfw', learning_rate=mix_hidden_lr)) avg_cost = fluid.layers.mean(crf_cost) @@ -425,31 +378,29 @@ def train(use_cuda, save_dirname=None, 
is_local=True): staircase=True)) sgd_optimizer.minimize(avg_cost) +``` - # The CRF decoding layer is used for evaluation and inference. - # It shares weights with CRF layer. The sharing of parameters among multiple layers - # is specified by using the same parameter name in these layers. If true tag sequence - # is provided in training process, `fluid.layers.crf_decoding` calculates labelling error - # for each input token and sums the error over the entire sequence. - # Otherwise, `fluid.layers.crf_decoding` generates the labelling tags. - crf_decode = fluid.layers.crf_decoding( - input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) +- 数据介绍部分提到CoNLL 2005训练集付费,这里我们使用测试集训练供大家学习。conll05.test()每次产生一条样本,包含9个特征,shuffle和组完batch后作为训练的输入。 +```python train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.test(), buf_size=8192), + paddle.reader.shuffle(paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) +``` - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - +- 通过feeding来指定每一个数据和data_layer的对应关系, 下面的feeding表示 conll05.test()产生数据的第0列对应的data_layer是`word` +```python feeder = fluid.DataFeeder( feed_list=[ word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target ], place=place) - exe = fluid.Executor(place) +``` + +- 最后定义`train_loop()`函数来控制训练过程,并执行`train_loop()`函数 +```python def train_loop(main_program): exe.run(fluid.default_startup_program()) embedding_param = fluid.global_scope().find_var( @@ -462,19 +413,19 @@ def train(use_cuda, save_dirname=None, is_local=True): batch_id = 0 for pass_id in six.moves.xrange(PASS_NUM): for data in train_data(): - cost = exe.run(main_program, - feed=feeder.feed(data), - fetch_list=[avg_cost]) + cost = exe.run( + main_program, feed=feeder.feed(data), fetch_list=[avg_cost]) cost = cost[0] if batch_id % 10 == 0: - print("avg_cost: " + str(cost)) + print("avg_cost:" + str(cost)) if batch_id != 0: - print("second per batch: " + str((time.time( - ) - start_time) / batch_id)) + print("second per batch: " + str(( + time.time() - start_time) / batch_id)) # Set the threshold low to speed up the CI test if float(cost) < 60.0: if save_dirname is not None: + # TODO(liuyiqun): Change the target to crf_decode fluid.io.save_inference_model(save_dirname, [ 'word_data', 'verb_data', 'ctx_n2_data', 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', @@ -490,7 +441,9 @@ def train(use_cuda, save_dirname=None, is_local=True): ## 应用模型 -训练完成之后,需要依据某个我们关心的性能指标选择最优的模型进行预测,可以简单的选择测试集上标记错误最少的那个模型。以下我们给出一个使用训练后的模型进行预测的示例。 +训练完成之后,需要依据某个我们关心的性能指标选择最优的模型进行预测,可以简单的选择测试集上标记错误最少的那个模型。以下我们给出一个使用训练后的模型进行预测的示例 + +- 加载inference model ```python def infer(use_cuda, save_dirname=None): @@ -502,26 +455,23 @@ def infer(use_cuda, save_dirname=None): inference_scope = fluid.core.Scope() with fluid.scope_guard(inference_scope): - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be fed - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). + # 使用fluid.io.load_inference_model加载inference_program, + # feed_target_names是模型的输入变量的名称,fetch_targets是预测对象 [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) +``` - # Setup inputs by creating LoDTensors to represent sequences of words. 
- # Here each word is the basic element of these LoDTensors and the shape of - # each word (base_shape) should be [1] since it is simply an index to - # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensors will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. +- 输入数据,这里构造假数据作为输入 + +```python + # 设置输入,用LoDTensor来表示输入的词序列,这里每个词的形状 + # base_shape都是[1],是因为每个词都是用一个id来表示的。 + # 假如基于长度的LoD是[[3, 4, 2]],这是一个单层的LoD,那么构造出的 + # LoDTensor就包含3个序列,其长度分别为3、4和2。 + # 注意LoD是个列表的列表 lod = [[3, 4, 2]] base_shape = [1] - # The range of random integers is [low, high] + # 整数随机数的范围是 [low, high] word = fluid.create_random_int_lodtensor( lod, base_shape, place, low=0, high=word_dict_len - 1) pred = fluid.create_random_int_lodtensor( @@ -538,9 +488,13 @@ def infer(use_cuda, save_dirname=None): lod, base_shape, place, low=0, high=word_dict_len - 1) mark = fluid.create_random_int_lodtensor( lod, base_shape, place, low=0, high=mark_dict_len - 1) +``` + +- 执行预测 - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. +```python + # 构造feed字典 {feed_target_name: feed_target_data} + # results是由预测目标构成的列表 assert feed_target_names[0] == 'word_data' assert feed_target_names[1] == 'verb_data' assert feed_target_names[2] == 'ctx_n2_data' diff --git a/07.label_semantic_roles/train.py b/07.label_semantic_roles/train.py index 87c7f39..2952515 100644 --- a/07.label_semantic_roles/train.py +++ b/07.label_semantic_roles/train.py @@ -104,7 +104,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, def train(use_cuda, save_dirname=None, is_local=True): - # define network topology + # define data layers word = fluid.layers.data( name='word_data', shape=[1], dtype='int64', lod_level=1) predicate = fluid.layers.data( -- GitLab
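For readers reproducing the chapter end to end, a minimal driver sketch tying the pieces together; the `main` function, the save path, and the CPU default below are assumptions for illustration, while `train()` and `infer()` are the functions defined in the sections above:

```python
import paddle.fluid as fluid

def main(use_cuda):
    # Skip silently if CUDA was requested but this Paddle build has no GPU support.
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    # Hypothetical directory for the saved inference model.
    save_dirname = "label_semantic_roles.inference.model"
    train(use_cuda, save_dirname)
    infer(use_cuda, save_dirname)

if __name__ == '__main__':
    main(use_cuda=False)
```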