diff --git a/06.label_semantic_roles/README.en.md b/06.label_semantic_roles/README.en.md index f40269c3a3e93d59e425ceaad4985e2106651821..f62437a90441e6e6bf5f91c2d53ff479b0a0535f 100644 --- a/06.label_semantic_roles/README.en.md +++ b/06.label_semantic_roles/README.en.md @@ -373,11 +373,11 @@ crf_cost = paddle.layer.crf( ```python crf_dec = paddle.layer.crf_decoding( - name='crf_dec_l', size=label_dict_len, input=feature_out, label=target, param_attr=paddle.attr.Param(name='crfw')) +evaluator.sum(input=crf_dec) ``` ## Train model @@ -420,7 +420,8 @@ optimizer = paddle.optimizer.Momentum( trainer = paddle.trainer.SGD(cost=crf_cost, parameters=parameters, - update_equation=optimizer) + update_equation=optimizer, + extra_layers=crf_dec) ``` ### Trainer @@ -455,8 +456,19 @@ feeding = { def event_handler(event): if isinstance(event, paddle.event.EndIteration): if event.batch_id % 100 == 0: - print "Pass %d, Batch %d, Cost %f" % ( - event.pass_id, event.batch_id, event.cost) + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + if event.batch_id % 1000 == 0: + result = trainer.test(reader=reader, feeding=feeding) + print "\nTest with Pass %d, Batch %d, %s" % (event.pass_id, event.batch_id, result.metrics) + + if isinstance(event, paddle.event.EndPass): + # save parameters + with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f: + parameters.to_tar(f) + + result = trainer.test(reader=reader, feeding=feeding) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) ``` `trainer.train` will train the model. @@ -469,6 +481,42 @@ trainer.train( feeding=feeding) ``` +### Application + +Aftern training is done, we need to select an optimal model based one performance index to do inference. In this task, one can simply select the model with the least number of marks on the test set. The `paddle.layer.crf_decoding` layer is used in the inference, but its inputs does not include the ground truth label. 
+ +```python +predict = paddle.layer.crf_decoding( + size=label_dict_len, + input=feature_out, + param_attr=paddle.attr.Param(name='crfw')) +``` + +Here, we use one sample from the test set as an example. + +```python +test_creator = paddle.dataset.conll05.test() +test_data = [] +for item in test_creator(): + test_data.append(item[0:8]) + if len(test_data) == 1: + break +``` + +The inference interface `paddle.infer` returns the indices of the predicted labels. Then we print the tagging results using the dictionary `labels_reverse`. + + +```python +labs = paddle.infer( + output_layer=predict, parameters=parameters, input=test_data, field='id') +assert len(labs) == len(test_data[0][0]) +labels_reverse={} +for (k,v) in label_dict.items(): + labels_reverse[v]=k +pre_lab = [labels_reverse[i] for i in labs] +print pre_lab +``` + ## Conclusion Semantic Role Labeling is an important intermediate step in a wide range of natural language processing tasks. In this tutorial, we use SRL as an example to illustrate using PaddlePaddle to do sequence tagging tasks. The models proposed are from our published paper\[[10](#Reference)\]. We only use test data for illustration since the training data on the CoNLL 2005 dataset is not completely public. This aims to propose an end-to-end neural network model with fewer dependencies on natural language processing tools but is comparable, or even better than traditional models in terms of performance. Please check out our paper for more information and discussions. 
diff --git a/06.label_semantic_roles/README.md b/06.label_semantic_roles/README.md index fe92d852ad3e2a79e5be1d41aaef4ee26df182a1..315bdf23552df502ad2b3bd9a2bb5892d25fab53 100644 --- a/06.label_semantic_roles/README.md +++ b/06.label_semantic_roles/README.md @@ -189,6 +189,7 @@ conll05st-release/ ```python import math import numpy as np +import gzip import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 @@ -346,15 +347,15 @@ crf_cost = paddle.layer.crf( learning_rate=mix_hidden_lr)) ``` -- CRF译码层和CRF层参数名字相同,即共享权重。如果输入了正确的数据标签(target),会统计错误标签的个数,可以用来评估模型。如果没有输入正确的数据标签,该层可以推到出最优解,可以用来预测模型。 +- CRF译码层和CRF层参数名字相同,即共享权重。如果输入了正确的数据标签(target),会统计错误标签的个数,可以用来评估模型。如果没有输入正确的数据标签,该层可以推到出最优解,可以用来预测模型。在训练中,`evaluator.sum`对CRF译码层统计结果进行求和并得到平均标记错误率。 ```python crf_dec = paddle.layer.crf_decoding( - name='crf_dec_l', size=label_dict_len, input=feature_out, label=target, param_attr=paddle.attr.Param(name='crfw')) +evaluator.sum(input=crf_dec) ``` ## 训练模型 @@ -365,7 +366,7 @@ crf_dec = paddle.layer.crf_decoding( ```python # create parameters -parameters = paddle.parameters.create([crf_cost, crf_dec]) +parameters = paddle.parameters.create(crf_cost) ``` 可以打印参数名字,如果在网络配置中没有指定名字,则默认生成。 @@ -400,7 +401,8 @@ optimizer = paddle.optimizer.Momentum( trainer = paddle.trainer.SGD(cost=crf_cost, parameters=parameters, - update_equation=optimizer) + update_equation=optimizer, + extra_layers=crf_dec) ``` ### 训练 @@ -436,8 +438,19 @@ feeding = { def event_handler(event): if isinstance(event, paddle.event.EndIteration): if event.batch_id % 100 == 0: - print "Pass %d, Batch %d, Cost %f" % ( - event.pass_id, event.batch_id, event.cost) + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + if event.batch_id % 1000 == 0: + result = trainer.test(reader=reader, feeding=feeding) + print "\nTest with Pass %d, Batch %d, %s" % (event.pass_id, event.batch_id, result.metrics) + + if isinstance(event, paddle.event.EndPass): + # save parameters + 
with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f: + parameters.to_tar(f) + + result = trainer.test(reader=reader, feeding=feeding) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) ``` 通过`trainer.train`函数训练: @@ -450,6 +463,41 @@ trainer.train( feeding=feeding) ``` +### 应用模型 + +训练完成之后,需要依据某个我们关心的性能指标选择最优的模型进行预测,可以简单的选择测试集上标记错误最少的那个模型。预测时使用 `paddle.layer.crf_decoding`,和训练不同的是,该层没有正确的标签层作为输入。如下所示: + +```python +predict = paddle.layer.crf_decoding( + size=label_dict_len, + input=feature_out, + param_attr=paddle.attr.Param(name='crfw')) +``` + +这里选用测试集的一条数据作为示例。 + +```python +test_creator = paddle.dataset.conll05.test() +test_data = [] +for item in test_creator(): + test_data.append(item[0:8]) + if len(test_data) == 1: + break +``` + +推断接口`paddle.infer`返回标签的索引,并查询词典`labels_reverse`,打印出标记的结果。 + +```python +labs = paddle.infer( + output_layer=predict, parameters=parameters, input=test_data, field='id') +assert len(labs) == len(test_data[0][0]) +labels_reverse={} +for (k,v) in label_dict.items(): + labels_reverse[v]=k +pre_lab = [labels_reverse[i] for i in labs] +print pre_lab +``` + ## 总结 语义角色标注是许多自然语言理解任务的重要中间步骤。这篇教程中我们以语义角色标注任务为例,介绍如何利用PaddlePaddle进行序列标注任务。教程中所介绍的模型来自我们发表的论文\[[10](#参考文献)\]。由于 CoNLL 2005 SRL任务的训练数据目前并非完全开放,教程中只使用测试数据作为示例。在这个过程中,我们希望减少对其它自然语言处理工具的依赖,利用神经网络数据驱动、端到端学习的能力,得到一个和传统方法可比、甚至更好的模型。在论文中我们证实了这种可能性。关于模型更多的信息和讨论可以在论文中找到。 diff --git a/06.label_semantic_roles/index.en.html b/06.label_semantic_roles/index.en.html index 74b6d477f0664222685f9a5de51fdb9dda9e8f79..4c4e22a9ff785f68e3f7687649982a476ef71c25 100644 --- a/06.label_semantic_roles/index.en.html +++ b/06.label_semantic_roles/index.en.html @@ -415,11 +415,11 @@ crf_cost = paddle.layer.crf( ```python crf_dec = paddle.layer.crf_decoding( - name='crf_dec_l', size=label_dict_len, input=feature_out, label=target, param_attr=paddle.attr.Param(name='crfw')) +evaluator.sum(input=crf_dec) ``` ## Train model @@ -462,7 +462,8 @@ optimizer = paddle.optimizer.Momentum( trainer = 
paddle.trainer.SGD(cost=crf_cost, parameters=parameters, - update_equation=optimizer) + update_equation=optimizer, + extra_layers=crf_dec) ``` ### Trainer @@ -497,8 +498,19 @@ feeding = { def event_handler(event): if isinstance(event, paddle.event.EndIteration): if event.batch_id % 100 == 0: - print "Pass %d, Batch %d, Cost %f" % ( - event.pass_id, event.batch_id, event.cost) + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + if event.batch_id % 1000 == 0: + result = trainer.test(reader=reader, feeding=feeding) + print "\nTest with Pass %d, Batch %d, %s" % (event.pass_id, event.batch_id, result.metrics) + + if isinstance(event, paddle.event.EndPass): + # save parameters + with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f: + parameters.to_tar(f) + + result = trainer.test(reader=reader, feeding=feeding) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) ``` `trainer.train` will train the model. @@ -511,6 +523,42 @@ trainer.train( feeding=feeding) ``` +### Application + +Aftern training is done, we need to select an optimal model based one performance index to do inference. In this task, one can simply select the model with the least number of marks on the test set. The `paddle.layer.crf_decoding` layer is used in the inference, but its inputs does not include the ground truth label. + +```python +predict = paddle.layer.crf_decoding( + size=label_dict_len, + input=feature_out, + param_attr=paddle.attr.Param(name='crfw')) +``` + +Here, using one testing sample as an example. + +```python +test_creator = paddle.dataset.conll05.test() +test_data = [] +for item in test_creator(): + test_data.append(item[0:8]) + if len(test_data) == 1: + break +``` + +The inference interface `paddle.infer` returns the index of predicting labels. Then printing the tagging results based dictionary `labels_reverse`. 
+ + +```python +labs = paddle.infer( + output_layer=predict, parameters=parameters, input=test_data, field='id') +assert len(labs) == len(test_data[0][0]) +labels_reverse={} +for (k,v) in label_dict.items(): + labels_reverse[v]=k +pre_lab = [labels_reverse[i] for i in labs] +print pre_lab +``` + ## Conclusion Semantic Role Labeling is an important intermediate step in a wide range of natural language processing tasks. In this tutorial, we use SRL as an example to illustrate using PaddlePaddle to do sequence tagging tasks. The models proposed are from our published paper\[[10](#Reference)\]. We only use test data for illustration since the training data on the CoNLL 2005 dataset is not completely public. This aims to propose an end-to-end neural network model with fewer dependencies on natural language processing tools but is comparable, or even better than traditional models in terms of performance. Please check out our paper for more information and discussions. diff --git a/06.label_semantic_roles/index.html b/06.label_semantic_roles/index.html index 4e7806580f7b1d83e50fd43b079ce6631c64da16..85c9bceb8c60dbb246ff70fd74ddfc51d7301b8c 100644 --- a/06.label_semantic_roles/index.html +++ b/06.label_semantic_roles/index.html @@ -231,6 +231,7 @@ conll05st-release/ ```python import math import numpy as np +import gzip import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 @@ -388,15 +389,15 @@ crf_cost = paddle.layer.crf( learning_rate=mix_hidden_lr)) ``` -- CRF译码层和CRF层参数名字相同,即共享权重。如果输入了正确的数据标签(target),会统计错误标签的个数,可以用来评估模型。如果没有输入正确的数据标签,该层可以推到出最优解,可以用来预测模型。 +- CRF译码层和CRF层参数名字相同,即共享权重。如果输入了正确的数据标签(target),会统计错误标签的个数,可以用来评估模型。如果没有输入正确的数据标签,该层可以推到出最优解,可以用来预测模型。在训练中,`evaluator.sum`对CRF译码层统计结果进行求和并得到平均标记错误率。 ```python crf_dec = paddle.layer.crf_decoding( - name='crf_dec_l', size=label_dict_len, input=feature_out, label=target, param_attr=paddle.attr.Param(name='crfw')) +evaluator.sum(input=crf_dec) ``` ## 训练模型 @@ -407,7 +408,7 @@ crf_dec = 
paddle.layer.crf_decoding( ```python # create parameters -parameters = paddle.parameters.create([crf_cost, crf_dec]) +parameters = paddle.parameters.create(crf_cost) ``` 可以打印参数名字,如果在网络配置中没有指定名字,则默认生成。 @@ -442,7 +443,8 @@ optimizer = paddle.optimizer.Momentum( trainer = paddle.trainer.SGD(cost=crf_cost, parameters=parameters, - update_equation=optimizer) + update_equation=optimizer, + extra_layers=crf_dec) ``` ### 训练 @@ -478,8 +480,19 @@ feeding = { def event_handler(event): if isinstance(event, paddle.event.EndIteration): if event.batch_id % 100 == 0: - print "Pass %d, Batch %d, Cost %f" % ( - event.pass_id, event.batch_id, event.cost) + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + if event.batch_id % 1000 == 0: + result = trainer.test(reader=reader, feeding=feeding) + print "\nTest with Pass %d, Batch %d, %s" % (event.pass_id, event.batch_id, result.metrics) + + if isinstance(event, paddle.event.EndPass): + # save parameters + with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f: + parameters.to_tar(f) + + result = trainer.test(reader=reader, feeding=feeding) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) ``` 通过`trainer.train`函数训练: @@ -492,6 +505,41 @@ trainer.train( feeding=feeding) ``` +### 应用模型 + +训练完成之后,需要依据某个我们关心的性能指标选择最优的模型进行预测,可以简单的选择测试集上标记错误最少的那个模型。预测时使用 `paddle.layer.crf_decoding`,和训练不同的是,该层没有正确的标签层作为输入。如下所示: + +```python +predict = paddle.layer.crf_decoding( + size=label_dict_len, + input=feature_out, + param_attr=paddle.attr.Param(name='crfw')) +``` + +这里选用测试集的一条数据作为示例。 + +```python +test_creator = paddle.dataset.conll05.test() +test_data = [] +for item in test_creator(): + test_data.append(item[0:8]) + if len(test_data) == 1: + break +``` + +推断接口`paddle.infer`返回标签的索引,并查询词典`labels_reverse`,打印出标记的结果。 + +```python +labs = paddle.infer( + output_layer=predict, parameters=parameters, input=test_data, field='id') +assert len(labs) == len(test_data[0][0]) 
+labels_reverse={} +for (k,v) in label_dict.items(): + labels_reverse[v]=k +pre_lab = [labels_reverse[i] for i in labs] +print pre_lab +``` + ## 总结 语义角色标注是许多自然语言理解任务的重要中间步骤。这篇教程中我们以语义角色标注任务为例,介绍如何利用PaddlePaddle进行序列标注任务。教程中所介绍的模型来自我们发表的论文\[[10](#参考文献)\]。由于 CoNLL 2005 SRL任务的训练数据目前并非完全开放,教程中只使用测试数据作为示例。在这个过程中,我们希望减少对其它自然语言处理工具的依赖,利用神经网络数据驱动、端到端学习的能力,得到一个和传统方法可比、甚至更好的模型。在论文中我们证实了这种可能性。关于模型更多的信息和讨论可以在论文中找到。 diff --git a/06.label_semantic_roles/train.py b/06.label_semantic_roles/train.py index 11b3709c6e9c76e84798c076cb9e1c7d30a98efc..be00031849a8dcd90095119767a9d607a8b271e7 100644 --- a/06.label_semantic_roles/train.py +++ b/06.label_semantic_roles/train.py @@ -1,25 +1,30 @@ import math import numpy as np +import gzip import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 +import paddle.v2.evaluator as evaluator +word_dict, verb_dict, label_dict = conll05.get_dict() +word_dict_len = len(word_dict) +label_dict_len = len(label_dict) +pred_len = len(verb_dict) -def db_lstm(): - word_dict, verb_dict, label_dict = conll05.get_dict() - word_dict_len = len(word_dict) - label_dict_len = len(label_dict) - pred_len = len(verb_dict) +mark_dict_len = 2 +word_dim = 32 +mark_dim = 5 +hidden_dim = 512 +depth = 8 +default_std = 1 / math.sqrt(hidden_dim) / 3.0 +mix_hidden_lr = 1e-3 - mark_dict_len = 2 - word_dim = 32 - mark_dim = 5 - hidden_dim = 512 - depth = 8 - #8 features - def d_type(size): - return paddle.data_type.integer_value_sequence(size) +def d_type(size): + return paddle.data_type.integer_value_sequence(size) + +def db_lstm(): + #8 features word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) @@ -30,11 +35,8 @@ def db_lstm(): ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) - target = paddle.layer.data(name='target', type=d_type(label_dict_len)) - emb_para = 
paddle.attr.Param(name='emb', initial_std=0., is_static=True) std_0 = paddle.attr.Param(initial_std=0.) - default_std = 1 / math.sqrt(hidden_dim) / 3.0 std_default = paddle.attr.Param(initial_std=default_std) predicate_embedding = paddle.layer.embedding( @@ -60,7 +62,6 @@ def db_lstm(): input=emb, param_attr=std_default) for emb in emb_layers ]) - mix_hidden_lr = 1e-3 lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) hidden_para_attr = paddle.attr.Param( initial_std=default_std, learning_rate=mix_hidden_lr) @@ -108,21 +109,7 @@ def db_lstm(): input=input_tmp[1], param_attr=lstm_para_attr) ], ) - crf_cost = paddle.layer.crf( - size=label_dict_len, - input=feature_out, - label=target, - param_attr=paddle.attr.Param( - name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr)) - - crf_dec = paddle.layer.crf_decoding( - name='crf_dec_l', - size=label_dict_len, - input=feature_out, - label=target, - param_attr=paddle.attr.Param(name='crfw')) - - return crf_cost, crf_dec + return feature_out def load_parameter(file_name, h, w): @@ -135,10 +122,24 @@ def main(): paddle.init(use_gpu=False, trainer_count=1) # define network topology - crf_cost, crf_dec = db_lstm() + feature_out = db_lstm() + target = paddle.layer.data(name='target', type=d_type(label_dict_len)) + crf_cost = paddle.layer.crf( + size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param( + name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr)) + + crf_dec = paddle.layer.crf_decoding( + size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param(name='crfw')) + evaluator.sum(input=crf_dec) # create parameters - parameters = paddle.parameters.create([crf_cost, crf_dec]) + parameters = paddle.parameters.create(crf_cost) parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) # create optimizer @@ -150,7 +151,10 @@ def main(): average_window=0.5, max_average_window=10000), ) trainer = 
paddle.trainer.SGD( - cost=crf_cost, parameters=parameters, update_equation=optimizer) + cost=crf_cost, + parameters=parameters, + update_equation=optimizer, + extra_layers=crf_dec) reader = paddle.batch( paddle.reader.shuffle(conll05.test(), buf_size=8192), batch_size=10) @@ -170,15 +174,50 @@ def main(): def event_handler(event): if isinstance(event, paddle.event.EndIteration): if event.batch_id % 100 == 0: - print "Pass %d, Batch %d, Cost %f" % ( - event.pass_id, event.batch_id, event.cost) + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + if event.batch_id % 1000 == 0: + result = trainer.test(reader=reader, feeding=feeding) + print "\nTest with Pass %d, Batch %d, %s" % ( + event.pass_id, event.batch_id, result.metrics) + + if isinstance(event, paddle.event.EndPass): + # save parameters + with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f: + parameters.to_tar(f) + + result = trainer.test(reader=reader, feeding=feeding) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) trainer.train( reader=reader, event_handler=event_handler, - num_passes=10000, + num_passes=1, feeding=feeding) + test_creator = paddle.dataset.conll05.test() + test_data = [] + for item in test_creator(): + test_data.append(item[0:8]) + if len(test_data) == 1: + break + + predict = paddle.layer.crf_decoding( + size=label_dict_len, + input=feature_out, + param_attr=paddle.attr.Param(name='crfw')) + probs = paddle.infer( + output_layer=predict, + parameters=parameters, + input=test_data, + field='id') + assert len(probs) == len(test_data[0][0]) + labels_reverse = {} + for (k, v) in label_dict.items(): + labels_reverse[v] = k + pre_lab = [labels_reverse[i] for i in probs] + print pre_lab + if __name__ == '__main__': main()