diff --git a/ltr/README.md b/ltr/README.md index a8fa8195537a92af1fffcad9c3af15005cbb18ae..d4d80e3ef16083168a491d37a78a7da1cf289b51 100644 --- a/ltr/README.md +++ b/ltr/README.md @@ -1,4 +1,4 @@ -## 排序学习(LearningToRank) +# 排序学习(Learning To Rank) 排序学习技术\[[1](#参考文献1)\]是构建排序模型的机器学习方法,在信息检索、自然语言处理,数据挖掘等机器学场景中具有重要作用。排序学习的主要目的是对给定一组文档,对任意查询请求给出反映相关性的文档排序。在本例子中,利用标注过的语料库训练两种经典排序模型RankNet[[4](#参考文献4)\]和LamdaRank[[6](#参考文献6)\],分别可以生成对应的排序模型,能够对任意查询请求,给出相关性文档排序。 @@ -238,9 +238,9 @@ $$\lambda _{i,j}=\frac{\partial C}{\partial s_{i}}=-\frac{\sigma }{1+e^{\sigma ( ```python import paddle.v2 as paddle -def lambdaRank(input_dim): +def lambda_rank(input_dim): """ - lambdaRank is a ListWise Rank Model, input data and label must be sequence + lambda_rank is a ListWise Rank Model, input data and label must be sequence https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf parameters : input_dim, one document's dense feature vector dimension diff --git a/ltr/lambda_rank.py b/ltr/lambda_rank.py index 7a213791f300f45387c44c6af0ae8db7879b0d5e..123edc353cee9447a801b9300099df9d8fd64559 100644 --- a/ltr/lambda_rank.py +++ b/ltr/lambda_rank.py @@ -4,18 +4,16 @@ import paddle.v2 as paddle import numpy as np import functools -#lambdaRank is listwise learning to rank model - -def lambdaRank(input_dim): +def lambda_rank(input_dim): """ - lambdaRank is a ListWise Rank Model, input data and label must be sequence + lambda_rank is a Listwise rank model, the input data and label must be sequences. https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf parameters : input_dim, one document's dense feature vector dimension - dense_vector_sequence format - [[f, ...], [f, ...], ...], f is represent for an float or int number + format of the dense_vector_sequence: + [[f, ...], [f, ...], ...], f is a float or an int number """ label = paddle.layer.data("label", paddle.data_type.dense_vector_sequence(1)) @@ -48,7 +46,7 @@ def lambdaRank(input_dim): return cost, output -def train_lambdaRank(num_passes): +def train_lambda_rank(num_passes): # listwise input sequence fill_default_train = functools.partial( paddle.dataset.mq2007.train, format="listwise") @@ -60,7 +58,7 @@ def train_lambdaRank(num_passes): # mq2007 input_dim = 46, dense format input_dim = 46 - cost, output = lambdaRank(input_dim) + cost, output = lambda_rank(input_dim) parameters = paddle.parameters.create(cost) trainer = paddle.trainer.SGD( @@ -76,7 +74,7 @@ def train_lambdaRank(num_passes): if isinstance(event, paddle.event.EndPass): result = trainer.test(reader=test_reader, feeding=feeding) print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) - with gzip.open("lambdaRank_params_%d.tar.gz" % (event.pass_id), + with gzip.open("lambda_rank_params_%d.tar.gz" % (event.pass_id), "w") as f: parameters.to_tar(f) @@ -88,17 +86,17 @@ def train_lambdaRank(num_passes): num_passes=num_passes) -def lambdaRank_infer(pass_id): +def lambda_rank_infer(pass_id): """ - lambdaRank model inference interface + lambda_rank model inference interface parameters: pass_id : inference model in pass_id """ print "Begin to Infer..." input_dim = 46 - output = lambdaRank(input_dim) + output = lambda_rank(input_dim) parameters = paddle.parameters.Parameters.from_tar( - gzip.open("lambdaRank_params_%d.tar.gz" % (pass_id - 1))) + gzip.open("lambda_rank_params_%d.tar.gz" % (pass_id - 1))) infer_query_id = None infer_data = [] @@ -119,6 +117,6 @@ def lambdaRank_infer(pass_id): if __name__ == '__main__': - paddle.init(use_gpu=False, trainer_count=4) - train_lambdaRank(2) - lambdaRank_infer(pass_id=1) + paddle.init(use_gpu=False, trainer_count=1) + train_lambda_rank(2) + lambda_rank_infer(pass_id=1) diff --git a/sequence_tagging_for_ner/README.md b/sequence_tagging_for_ner/README.md index e0c488c9aa5298d9d316d45f121c68dc41be73b2..fadfe4564a912641119d6864e36d2ba0874ff96e 100644 --- a/sequence_tagging_for_ner/README.md +++ b/sequence_tagging_for_ner/README.md @@ -1,24 +1,23 @@ # 命名实体识别 -## 背景说明 - 命名实体识别(Named Entity Recognition,NER)又称作“专名识别”,是指识别文本中具有特定意义的实体,主要包括人名、地名、机构名、专有名词等,是自然语言处理研究的一个基础问题。NER任务通常包括实体边界识别、确定实体类别两部分,可以将其作为序列标注问题解决。 -序列标注可以分为Sequence Classification、Segment Classification和Temporal Classification三类[[1](#参考文献)],我们这里限定序列标注为Segment Classification,即对输入序列中的每个元素在输出序列中给出对应的标签。对于NER任务,由于需要标识边界,一般采用[BIO方式](http://book.paddlepaddle.org/07.label_semantic_roles/)定义的标签集,如下是一个NER的标注结果示例: +序列标注可以分为Sequence Classification、Segment Classification和Temporal Classification三类[[1](#参考文献)],本例只考虑Segment Classification,即对输入序列中的每个元素在输出序列中给出对应的标签。对于NER任务,由于需要标识边界,一般采用[BIO方式](http://book.paddlepaddle.org/07.label_semantic_roles/)定义的标签集,如下是一个NER的标注结果示例: -