提交 a87a3c96 编写于 作者: D dzhwinter

"update comment"

上级 561b6c82
...@@ -8,6 +8,15 @@ import functools ...@@ -8,6 +8,15 @@ import functools
def lambdaRank(input_dim): def lambdaRank(input_dim):
"""
LambdaRank is a listwise ranking model; input data and labels must be sequences.
https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf
parameters:
input_dim: dimension of one document's dense feature vector
dense_vector_sequence format:
[[f, ...], [f, ...], ...], where f is a float or int number
"""
label = paddle.layer.data("label", label = paddle.layer.data("label",
paddle.data_type.dense_vector_sequence(1)) paddle.data_type.dense_vector_sequence(1))
data = paddle.layer.data("data", data = paddle.layer.data("data",
...@@ -16,14 +25,24 @@ def lambdaRank(input_dim): ...@@ -16,14 +25,24 @@ def lambdaRank(input_dim):
# hidden layer # hidden layer
hd1 = paddle.layer.fc( hd1 = paddle.layer.fc(
input=data, input=data,
size=128,
act=paddle.activation.Tanh(),
param_attr=paddle.attr.Param(initial_std=0.01))
hd2 = paddle.layer.fc(
input=hd1,
size=10, size=10,
act=paddle.activation.Tanh(), act=paddle.activation.Tanh(),
param_attr=paddle.attr.Param(initial_std=0.01)) param_attr=paddle.attr.Param(initial_std=0.01))
output = paddle.layer.fc( output = paddle.layer.fc(
input=hd1, input=hd2,
size=1, size=1,
act=paddle.activation.Linear(), act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(initial_std=0.01)) param_attr=paddle.attr.Param(initial_std=0.01))
# evaluator
evaluator = paddle.evaluator.auc(input=output, label=label)
# cost layer
cost = paddle.layer.lambda_cost( cost = paddle.layer.lambda_cost(
input=output, score=label, NDCG_num=6, max_sort_size=-1) input=output, score=label, NDCG_num=6, max_sort_size=-1)
return cost, output return cost, output
...@@ -39,7 +58,7 @@ def train_lambdaRank(num_passes): ...@@ -39,7 +58,7 @@ def train_lambdaRank(num_passes):
paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32) paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32)
test_reader = paddle.batch(fill_default_test, batch_size=32) test_reader = paddle.batch(fill_default_test, batch_size=32)
# mq2007 input_dim = 46, dense format # mq2007 input_dim = 46, dense format
input_dim = 46 input_dim = 46
cost, output = lambdaRank(input_dim) cost, output = lambdaRank(input_dim)
parameters = paddle.parameters.create(cost) parameters = paddle.parameters.create(cost)
...@@ -83,20 +102,23 @@ def lambdaRank_infer(pass_id): ...@@ -83,20 +102,23 @@ def lambdaRank_infer(pass_id):
infer_query_id = None infer_query_id = None
infer_data = [] infer_data = []
infer_data_num = 1000 infer_data_num = 1
fill_default_test = functools.partial( fill_default_test = functools.partial(
paddle.dataset.mq2007.test, format="listwise") paddle.dataset.mq2007.test, format="listwise")
for label, querylist in fill_default_test(): for label, querylist in fill_default_test():
infer_data.append(querylist) infer_data.append(querylist)
if len(infer_data) == infer_data_num: if len(infer_data) == infer_data_num:
break break
# Predict the score of each document in infer_data. Re-sort the documents based on the
# predicted score in descending order to build the ranked document list.
predicitons = paddle.infer( predicitons = paddle.infer(
output_layer=output, parameters=parameters, input=infer_data) output_layer=output, parameters=parameters, input=infer_data)
for i, score in enumerate(predicitons): for i, score in enumerate(predicitons):
print score print i, score
if __name__ == '__main__': if __name__ == '__main__':
paddle.init(use_gpu=False, trainer_count=4) paddle.init(use_gpu=False, trainer_count=4)
train_lambdaRank(100) train_lambdaRank(2)
lambdaRank_infer(pass_id=2) lambdaRank_infer(pass_id=1)
import os, sys import os
import sys
import gzip import gzip
import functools import functools
import paddle.v2 as paddle import paddle.v2 as paddle
...@@ -37,7 +38,7 @@ def half_ranknet(name_prefix, input_dim): ...@@ -37,7 +38,7 @@ def half_ranknet(name_prefix, input_dim):
def ranknet(input_dim): def ranknet(input_dim):
# label layer # label layer
label = paddle.layer.data("label", paddle.data_type.integer_value(1)) label = paddle.layer.data("label", paddle.data_type.dense_vector(1))
# reuse the parameter in half_ranknet # reuse the parameter in half_ranknet
output_left = half_ranknet("left", input_dim) output_left = half_ranknet("left", input_dim)
...@@ -56,7 +57,7 @@ def train_ranknet(num_passes): ...@@ -56,7 +57,7 @@ def train_ranknet(num_passes):
batch_size=100) batch_size=100)
test_reader = paddle.batch(paddle.dataset.mq2007.test, batch_size=100) test_reader = paddle.batch(paddle.dataset.mq2007.test, batch_size=100)
# mq2007 feature_dim = 46, dense format # mq2007 feature_dim = 46, dense format
# fc hidden_dim = 128 # fc hidden_dim = 128
feature_dim = 46 feature_dim = 46
cost = ranknet(feature_dim) cost = ranknet(feature_dim)
...@@ -106,10 +107,9 @@ def ranknet_infer(pass_id): ...@@ -106,10 +107,9 @@ def ranknet_infer(pass_id):
gzip.open("ranknet_params_%d.tar.gz" % (pass_id - 1))) gzip.open("ranknet_params_%d.tar.gz" % (pass_id - 1)))
# load data of same query and relevance documents, need ranknet to rank these candidates # load data of same query and relevance documents, need ranknet to rank these candidates
infer_query_id = None infer_query_id = []
infer_data = [] infer_data = []
infer_score_list = [] infer_doc_index = []
infer_data_num = 1000
# convert to mq2007 built-in data format # convert to mq2007 built-in data format
# <query_id> <relevance_score> <feature_vector> # <query_id> <relevance_score> <feature_vector>
...@@ -117,17 +117,19 @@ def ranknet_infer(pass_id): ...@@ -117,17 +117,19 @@ def ranknet_infer(pass_id):
paddle.dataset.mq2007.test, format="plain_txt") paddle.dataset.mq2007.test, format="plain_txt")
for query_id, relevance_score, feature_vector in plain_txt_test(): for query_id, relevance_score, feature_vector in plain_txt_test():
if infer_query_id == None: infer_query_id.append(query_id)
infer_query_id = query_id
elif infer_query_id != query_id:
break
infer_data.append(feature_vector) infer_data.append(feature_vector)
predicitons = paddle.infer(
# Predict the score of each document in infer_data. Re-sort the documents based on the
# predicted score in descending order to build the ranked document list.
scores = paddle.infer(
output_layer=output, parameters=parameters, input=infer_data) output_layer=output, parameters=parameters, input=infer_data)
for query_id, score in zip(infer_query_id, scores):
print "query_id : ", query_id, " ranknet rank document order : ", score
if __name__ == '__main__': if __name__ == '__main__':
paddle.init(use_gpu=False, trainer_count=4) paddle.init(use_gpu=False, trainer_count=4)
pass_num = 10 pass_num = 2
train_ranknet(pass_num) train_ranknet(pass_num)
ranknet_infer(pass_id=pass_num - 1) ranknet_infer(pass_id=pass_num - 1)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册