提交 65e8a7e8 编写于 作者: W wangmeng28

restructure the code of ltr

上级 82611c75
......@@ -96,7 +96,7 @@ $$\lambda _{i,j}=\frac{\partial C}{\partial s_{i}} = \frac{1}{2}(1-S_{i,j})-\fra
训练`RankNet`模型在命令行执行:
```bash
python ranknet.py
python train.py --model_type ranknet
```
初次执行会自动下载数据,训练RankNet模型,并将每个轮次的模型参数存储下来。
......@@ -104,9 +104,7 @@ python ranknet.py
使用训练好的`RankNet`模型继续进行预测,在命令行执行:
```bash
python ranknet.py \
--run_type infer \
--test_model_path models/ranknet_params_0.tar.gz
python infer.py --model_type ranknet --test_model_path models/ranknet_params_0.tar.gz
```
本例提供了RankNet模型的训练和预测两个部分。完成训练后的模型分为拓扑结构(需要注意`rank_cost`不是模型拓扑结构的一部分)和模型参数文件两部分。在本例子中复用了`ranknet`训练时的模型拓扑结构`half_ranknet`,模型参数从外存中加载。模型预测的输入为单个文档的特征向量,模型会给出相关性得分。将预测得分排序即可得到最终的文档相关性排序结果。
......@@ -193,7 +191,7 @@ $$\lambda _{i,j}=\frac{\partial C}{\partial s_{i}}=-\frac{\sigma }{1+e^{\sigma (
训练`LambdaRank`模型在命令行执行:
```bash
python lambda_rank.py
python train.py --model_type lambdarank
```
初次运行脚本会自动下载数据,训练LambdaRank模型,并将每个轮次的模型存储下来。
......@@ -203,9 +201,7 @@ LambdaRank模型预测过程和RankNet相同。预测时的模型拓扑结构复
使用训练好的`LambdaRank`模型继续进行预测,在命令行执行:
```bash
python lambda_rank.py \
--run_type infer \
--test_model_path models/lambda_rank_params_0.tar.gz
python infer.py --model_type lambdarank --test_model_path models/lambda_rank_params_0.tar.gz
```
## 自定义 LambdaRank数据
......
import os
import gzip
import functools
import argparse
import paddle.v2 as paddle
from ranknet import half_ranknet
from lambda_rank import lambda_rank
def ranknet_infer(input_dim, model_path):
    """
    RankNet model inference interface.

    Parameters:
        input_dim : int, dimension of one document's dense feature vector.
        model_path : str, path of a gzip-compressed tar archive holding the
            trained parameters.

    Prints one "query_id : ... score : ..." line per test document; the
    caller can sort by score to obtain the final ranking.
    """
    # we just need half_ranknet to predict a rank score,
    # which can be used in sort documents
    output = half_ranknet("right", input_dim)
    # Load trained parameters from the gzipped tar archive.
    parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
    # load data of same query and relevance documents,
    # need ranknet to rank these candidates
    infer_query_id = []
    infer_data = []
    infer_doc_index = []  # NOTE(review): never appended to below; looks unused.
    # convert to mq2007 built-in data format
    # <query_id> <relevance_score> <feature_vector>
    plain_txt_test = functools.partial(
        paddle.dataset.mq2007.test, format="plain_txt")
    # Collect every test document; the relevance score is ignored at
    # inference time (only query id and features are needed).
    for query_id, relevance_score, feature_vector in plain_txt_test():
        infer_query_id.append(query_id)
        infer_data.append([feature_vector])
    # predict score of infer_data document.
    # Re-sort the document base on predict score
    # in descending order. then we build the ranking documents
    scores = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    for query_id, score in zip(infer_query_id, scores):
        print "query_id : ", query_id, " score : ", score
def lambda_rank_infer(input_dim, model_path):
    """
    LambdaRank model inference interface.

    Parameters:
        input_dim : int, dimension of one document's dense feature vector.
        model_path : str, path of a gzip-compressed tar archive holding the
            trained parameters.

    Prints an index/score line for each document of the first test query
    list (only ``infer_data_num`` query lists are scored).
    """
    # Build the inference-only topology (no cost layer).
    output = lambda_rank(input_dim, is_infer=True)
    parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
    infer_query_id = None  # NOTE(review): never reassigned; looks unused.
    infer_data = []
    # Cap inference to this many query lists from the test set.
    infer_data_num = 1
    fill_default_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")
    # The listwise reader yields (label, querylist); the label is not
    # needed for prediction.
    for label, querylist in fill_default_test():
        infer_data.append([querylist])
        if len(infer_data) == infer_data_num:
            break
    # Predict score of infer_data document.
    # Re-sort the document base on predict score.
    # In descending order. then we build the ranking documents.
    predicitons = paddle.infer(  # NOTE(review): variable name typo, kept as-is.
        output_layer=output, parameters=parameters, input=infer_data)
    for i, score in enumerate(predicitons):
        print i, score
def _parse_bool(value):
    """Parse a command-line boolean string ("True"/"False", case-insensitive).

    ``type=bool`` is a classic argparse pitfall: any non-empty string —
    including "False" — is truthy, so ``--use_gpu False`` would silently
    enable the GPU. This helper interprets the text explicitly and rejects
    unrecognized spellings with a proper argparse error.
    """
    lowered = str(value).strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected, got %r." % value)


def parse_args():
    """Parse and return the command-line arguments of the inference script.

    Returns:
        argparse.Namespace with attributes: model_type ("ranknet" or
        "lambdarank"), use_gpu (bool), trainer_count (int), and
        test_model_path (str, required).
    """
    parser = argparse.ArgumentParser(
        description="PaddlePaddle learning to rank example.")
    parser.add_argument(
        "--model_type",
        type=str,
        help=("A flag indicating to run the RankNet or the LambdaRank model. "
              "Available options are: ranknet or lambdarank."),
        default="ranknet")
    parser.add_argument(
        "--use_gpu",
        # Fixed: was ``type=bool``, which treats "False" as True.
        type=_parse_bool,
        help="A flag indicating whether to use the GPU device in training.",
        default=False)
    parser.add_argument(
        "--trainer_count",
        type=int,
        help="The thread number used in training.",
        default=1)
    parser.add_argument(
        "--test_model_path",
        type=str,
        required=True,
        help=("The path of a trained model."))
    return parser.parse_args()
if __name__ == "__main__":
    # Fixed: the error branch below referenced a ``logger`` name that was
    # never defined in this script, raising a NameError instead of logging.
    import logging
    logger = logging.getLogger("paddle")
    logger.setLevel(logging.INFO)
    args = parse_args()
    # Fail fast with a clear message if the model archive is missing.
    # (Also fixed the "does not exit" typo in the message.)
    assert os.path.exists(args.test_model_path), (
        "The trained model does not exist. Please set a correct path.")
    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    # Training dataset: mq2007, input_dim = 46, dense format.
    input_dim = 46
    if args.model_type == "ranknet":
        ranknet_infer(input_dim, args.test_model_path)
    elif args.model_type == "lambdarank":
        lambda_rank_infer(input_dim, args.test_model_path)
    else:
        logger.fatal(("A wrong value for parameter model type. "
                      "Available options are: ranknet or lambdarank."))
import os
import sys
import gzip
import functools
import argparse
import logging
import numpy as np
"""
LambdaRank is a listwise rank model.
https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf
"""
import paddle.v2 as paddle
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def lambda_rank(input_dim, is_infer):
def lambda_rank(input_dim, is_infer=False):
"""
LambdaRank is a listwise rank model, the input data and label
must be sequences.
The input data and label for LambdaRank must be sequences.
https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf
parameters :
input_dim, one document's dense feature vector dimension
The format of the dense_vector_sequence is as follows:
[[f, ...], [f, ...], ...], f is a float or an int number
"""
if not is_infer:
label = paddle.layer.data("label",
paddle.data_type.dense_vector_sequence(1))
data = paddle.layer.data("data",
paddle.data_type.dense_vector_sequence(input_dim))
......@@ -49,134 +37,11 @@ def lambda_rank(input_dim, is_infer):
param_attr=paddle.attr.Param(initial_std=0.01))
if not is_infer:
# Define the cost layer.
label = paddle.layer.data("label",
paddle.data_type.dense_vector_sequence(1))
cost = paddle.layer.lambda_cost(
input=output, score=label, NDCG_num=6, max_sort_size=-1)
return cost, output
return output
def lambda_rank_train(num_passes, model_save_dir):
    """Train the listwise LambdaRank model on the MQ2007 dataset.

    Parameters:
        num_passes : int, number of training passes (epochs).
        model_save_dir : str, directory where a
            "lambda_rank_params_<pass>.tar.gz" archive is written after
            every pass.
    """
    # The input for LambdaRank must be a sequence.
    fill_default_train = functools.partial(
        paddle.dataset.mq2007.train, format="listwise")
    fill_default_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")
    train_reader = paddle.batch(
        paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32)
    test_reader = paddle.batch(fill_default_test, batch_size=32)
    # Training dataset: mq2007, input_dim = 46, dense format.
    input_dim = 46
    cost, output = lambda_rank(input_dim, is_infer=False)
    parameters = paddle.parameters.create(cost)
    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        update_equation=paddle.optimizer.Adam(learning_rate=1e-4))
    # Define end batch and end pass event handler.
    # NOTE: it closes over ``feeding``, which is assigned below but before
    # trainer.train() first invokes the handler, so the reference is valid.
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            logger.info("Pass %d Batch %d Cost %.9f" %
                        (event.pass_id, event.batch_id, event.cost))
        if isinstance(event, paddle.event.EndPass):
            # Evaluate on the test set and checkpoint after every pass.
            result = trainer.test(reader=test_reader, feeding=feeding)
            logger.info("\nTest with Pass %d, %s" %
                        (event.pass_id, result.metrics))
            with gzip.open(
                    os.path.join(model_save_dir, "lambda_rank_params_%d.tar.gz"
                                 % (event.pass_id)), "w") as f:
                trainer.save_parameter_to_tar(f)
    # Map data-layer names to their positions in each training sample.
    feeding = {"label": 0, "data": 1}
    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)
def lambda_rank_infer(test_model_path):
    """LambdaRank model inference interface.

    Parameters:
        test_model_path : str, path of a gzip-compressed tar archive of the
            trained parameters.

    Prints an index/score line for each document of the first test query
    list (only ``infer_data_num`` query lists are scored).
    """
    logger.info("Begin to Infer...")
    # MQ2007 uses 46 dense features per document.
    input_dim = 46
    # Inference-only topology (no cost layer).
    output = lambda_rank(input_dim, is_infer=True)
    parameters = paddle.parameters.Parameters.from_tar(
        gzip.open(test_model_path))
    infer_query_id = None  # NOTE(review): never reassigned; looks unused.
    infer_data = []
    # Cap inference to this many query lists from the test set.
    infer_data_num = 1
    fill_default_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")
    # The listwise reader yields (label, querylist); the label is ignored.
    for label, querylist in fill_default_test():
        infer_data.append([querylist])
        if len(infer_data) == infer_data_num:
            break
    # Predict score of infer_data document.
    # Re-sort the document base on predict score.
    # In descending order. then we build the ranking documents.
    predicitons = paddle.infer(  # NOTE(review): variable name typo, kept as-is.
        output_layer=output, parameters=parameters, input=infer_data)
    for i, score in enumerate(predicitons):
        print i, score
if __name__ == '__main__':
    # Fixed: two stray ``return`` statements (merge/diff residue) were
    # removed from the elif/else chain below — ``return`` is illegal at
    # module level and made this script a SyntaxError.
    parser = argparse.ArgumentParser(
        description="PaddlePaddle LambdaRank example.")
    parser.add_argument(
        "--run_type",
        type=str,
        help=("A flag indicating to run the training or the inferring task. "
              "Available options are: train or infer."),
        default="train")
    parser.add_argument(
        "--num_passes",
        type=int,
        help="The number of passes to train the model.",
        default=10)
    parser.add_argument(
        "--use_gpu",
        # Fixed: ``type=bool`` treats any non-empty string (even "False")
        # as True; parse the text explicitly so "--use_gpu False" works.
        type=lambda v: str(v).strip().lower() in ("true", "t", "yes", "y",
                                                  "1"),
        help="A flag indicating whether to use the GPU device in training.",
        default=False)
    parser.add_argument(
        "--trainer_count",
        type=int,
        help="The thread number used in training.",
        default=1)
    parser.add_argument(
        "--model_save_dir",
        type=str,
        required=False,
        help=("The path to save the trained models."),
        default="models")
    parser.add_argument(
        "--test_model_path",
        type=str,
        required=False,
        help=("This parameter works only in inferring task to "
              "specify path of a trained model."),
        default="")
    args = parser.parse_args()
    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    if args.run_type == "train":
        lambda_rank_train(args.num_passes, args.model_save_dir)
    elif args.run_type == "infer":
        # Fixed the "does not exit" typo in the message.
        assert os.path.exists(args.test_model_path), (
            "The trained model does not exist. Please set a correct path.")
        lambda_rank_infer(args.test_model_path)
    else:
        logger.fatal(("A wrong value for parameter run type. "
                      "Available options are: train or infer."))
import numpy as np
import unittest
def ndcg(score_list):
"""
measure the ndcg score of order list
https://en.wikipedia.org/wiki/Discounted_cumulative_gain
parameter:
score_list: np.array, shape=(sample_num,1)
e.g. predict rank score list :
>>> scores = [3, 2, 3, 0, 1, 2]
>>> ndcg_score = ndcg(scores)
"""
def dcg(score_list):
n = len(score_list)
cost = .0
for i in range(n):
cost += float(np.power(2, score_list[i])) / np.log((i + 1) + 1)
return cost
dcg_cost = dcg(score_list)
score_ranking = sorted(score_list, reverse=True)
ideal_cost = dcg(score_ranking)
return dcg_cost / ideal_cost
class TestNDCG(unittest.TestCase):
    """Unit test for the ndcg metric."""

    def test_array(self):
        # Mixed relevance scores with a known, hand-computed NDCG value.
        scores = [3, 2, 3, 0, 1, 2]
        self.assertAlmostEqual(0.9583, ndcg(scores), places=3)


if __name__ == '__main__':
    unittest.main()
import os
import sys
import gzip
import functools
import argparse
import logging
import numpy as np
"""
ranknet is the classic pairwise learning to rank algorithm
http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf
"""
import paddle.v2 as paddle
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
# ranknet is the classic pairwise learning to rank algorithm
# http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf
def score_diff(right_score, left_score):
    """Return the mean absolute gap between the right and left branch scores."""
    gaps = np.absolute(np.subtract(right_score, left_score))
    return np.average(gaps)
def half_ranknet(name_prefix, input_dim):
"""
......@@ -60,142 +46,3 @@ def ranknet(input_dim):
cost = paddle.layer.rank_cost(
name="cost", left=output_left, right=output_right, label=label)
return cost
def ranknet_train(num_passes, model_save_dir):
    """Train the pairwise RankNet model on the MQ2007 dataset.

    Parameters:
        num_passes : int, number of training passes (epochs).
        model_save_dir : str, directory where a
            "ranknet_params_<pass>.tar.gz" archive is written after every
            pass.
    """
    train_reader = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.mq2007.train, buf_size=100),
        batch_size=100)
    test_reader = paddle.batch(paddle.dataset.mq2007.test, batch_size=100)
    # mq2007 feature_dim = 46, dense format
    # fc hidden_dim = 128
    feature_dim = 46
    cost = ranknet(feature_dim)
    parameters = paddle.parameters.create(cost)
    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        update_equation=paddle.optimizer.Adam(learning_rate=2e-4))
    # Define the input data order
    feeding = {"label": 0, "left_data": 1, "right_data": 2}
    # Define end batch and end pass event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            # Every 25 batches, log the cost and the mean absolute gap
            # between the two branch scores (a pairwise-training signal).
            if event.batch_id % 25 == 0:
                diff = score_diff(
                    event.gm.getLayerOutputs("left_score")["left_score"][
                        "value"],
                    event.gm.getLayerOutputs("right_score")["right_score"][
                        "value"])
                logger.info(("Pass %d Batch %d : Cost %.6f, "
                             "average absolute diff scores: %.6f") %
                            (event.pass_id, event.batch_id, event.cost, diff))
        if isinstance(event, paddle.event.EndPass):
            # Evaluate on the test set and checkpoint after every pass.
            result = trainer.test(reader=test_reader, feeding=feeding)
            logger.info("\nTest with Pass %d, %s" %
                        (event.pass_id, result.metrics))
            with gzip.open(
                    os.path.join(model_save_dir, "ranknet_params_%d.tar.gz" %
                                 (event.pass_id)), "w") as f:
                trainer.save_parameter_to_tar(f)
    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)
def ranknet_infer(model_path):
    """
    Load the trained RankNet parameters and predict with plain-txt input.

    Parameters:
        model_path : str, path of a gzip-compressed tar archive of the
            trained parameters.

    Prints one "query_id : ... score : ..." line per test document.
    """
    logger.info("Begin to Infer...")
    # MQ2007 uses 46 dense features per document.
    feature_dim = 46
    # we just need half_ranknet to predict a rank score,
    # which can be used in sort documents
    output = half_ranknet("right", feature_dim)
    parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
    # load data of same query and relevance documents,
    # need ranknet to rank these candidates
    infer_query_id = []
    infer_data = []
    infer_doc_index = []  # NOTE(review): never appended to below; looks unused.
    # convert to mq2007 built-in data format
    # <query_id> <relevance_score> <feature_vector>
    plain_txt_test = functools.partial(
        paddle.dataset.mq2007.test, format="plain_txt")
    # The relevance score is ignored at inference time.
    for query_id, relevance_score, feature_vector in plain_txt_test():
        infer_query_id.append(query_id)
        infer_data.append([feature_vector])
    # predict score of infer_data document.
    # Re-sort the document base on predict score
    # in descending order. then we build the ranking documents
    scores = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    for query_id, score in zip(infer_query_id, scores):
        print "query_id : ", query_id, " score : ", score
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="PaddlePaddle RankNet example.")
    parser.add_argument(
        "--run_type",
        type=str,
        help=("A flag indicating to run the training or the inferring task. "
              "Available options are: train or infer."),
        default="train")
    parser.add_argument(
        "--num_passes",
        type=int,
        help="The number of passes to train the model.",
        default=10)
    parser.add_argument(
        "--use_gpu",
        # Fixed: ``type=bool`` treats any non-empty string (even "False")
        # as True; parse the text explicitly so "--use_gpu False" works.
        type=lambda v: str(v).strip().lower() in ("true", "t", "yes", "y",
                                                  "1"),
        help="A flag indicating whether to use the GPU device in training.",
        default=False)
    parser.add_argument(
        "--trainer_count",
        type=int,
        help="The thread number used in training.",
        default=1)
    parser.add_argument(
        "--model_save_dir",
        type=str,
        required=False,
        help=("The path to save the trained models."),
        default="models")
    parser.add_argument(
        "--test_model_path",
        type=str,
        required=False,
        help=("This parameter works only in inferring task to "
              "specify path of a trained model."),
        default="")
    args = parser.parse_args()
    # Make sure the checkpoint directory exists before training starts.
    if not os.path.exists(args.model_save_dir):
        os.mkdir(args.model_save_dir)
    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    if args.run_type == "train":
        ranknet_train(args.num_passes, args.model_save_dir)
    elif args.run_type == "infer":
        # Fixed the "does not exit" typo in the message.
        assert os.path.exists(
            args.test_model_path), "The trained model does not exist."
        ranknet_infer(args.test_model_path)
    else:
        logger.fatal(("A wrong value for parameter run type. "
                      "Available options are: train or infer."))
import os
import gzip
import functools
import argparse
import logging
import numpy as np
import paddle.v2 as paddle
from ranknet import ranknet
from lambda_rank import lambda_rank
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def ranknet_train(input_dim, num_passes, model_save_dir):
    """Train the pairwise RankNet model on the MQ2007 dataset.

    Parameters:
        input_dim : int, dimension of one document's dense feature vector.
        num_passes : int, number of training passes (epochs).
        model_save_dir : str, directory where a
            "ranknet_params_<pass>.tar.gz" archive is written after every
            pass.
    """
    shuffled_train = paddle.reader.shuffle(
        paddle.dataset.mq2007.train, buf_size=100)
    train_reader = paddle.batch(shuffled_train, batch_size=100)
    test_reader = paddle.batch(paddle.dataset.mq2007.test, batch_size=100)
    cost = ranknet(input_dim)
    parameters = paddle.parameters.create(cost)
    optimizer = paddle.optimizer.Adam(learning_rate=2e-4)
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)
    # Map data-layer names to their positions in each training sample.
    feeding = {"label": 0, "left_data": 1, "right_data": 2}

    def score_diff(right_score, left_score):
        # Mean absolute gap between the two branch scores of a pair.
        return np.average(np.abs(right_score - left_score))

    def event_handler(event):
        # Log progress every 25 batches; evaluate and checkpoint per pass.
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 25 == 0:
                left_value = event.gm.getLayerOutputs("left_score")[
                    "left_score"]["value"]
                right_value = event.gm.getLayerOutputs("right_score")[
                    "right_score"]["value"]
                diff = score_diff(left_value, right_value)
                logger.info(("Pass %d Batch %d : Cost %.6f, "
                             "average absolute diff scores: %.6f") %
                            (event.pass_id, event.batch_id, event.cost, diff))
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_reader, feeding=feeding)
            logger.info("\nTest with Pass %d, %s" %
                        (event.pass_id, result.metrics))
            save_path = os.path.join(
                model_save_dir, "ranknet_params_%d.tar.gz" % (event.pass_id))
            with gzip.open(save_path, "w") as f:
                trainer.save_parameter_to_tar(f)

    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)
def lambda_rank_train(input_dim, num_passes, model_save_dir):
    """Train the listwise LambdaRank model on the MQ2007 dataset.

    Parameters:
        input_dim : int, dimension of one document's dense feature vector.
        num_passes : int, number of training passes (epochs).
        model_save_dir : str, directory where a
            "lambda_rank_params_<pass>.tar.gz" archive is written after
            every pass.
    """
    # LambdaRank consumes whole query lists, so read in listwise format.
    listwise_train = functools.partial(
        paddle.dataset.mq2007.train, format="listwise")
    listwise_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")
    train_reader = paddle.batch(
        paddle.reader.shuffle(listwise_train, buf_size=100), batch_size=32)
    test_reader = paddle.batch(listwise_test, batch_size=32)
    cost = lambda_rank(input_dim)
    parameters = paddle.parameters.create(cost)
    optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)
    # Map data-layer names to their positions in each training sample.
    feeding = {"label": 0, "data": 1}

    def event_handler(event):
        # Log every batch; evaluate and checkpoint after each pass.
        if isinstance(event, paddle.event.EndIteration):
            logger.info("Pass %d Batch %d Cost %.9f" %
                        (event.pass_id, event.batch_id, event.cost))
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_reader, feeding=feeding)
            logger.info("\nTest with Pass %d, %s" %
                        (event.pass_id, result.metrics))
            save_path = os.path.join(
                model_save_dir,
                "lambda_rank_params_%d.tar.gz" % (event.pass_id))
            with gzip.open(save_path, "w") as f:
                trainer.save_parameter_to_tar(f)

    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)
def _parse_bool(value):
    """Parse a command-line boolean string ("True"/"False", case-insensitive).

    ``type=bool`` is a classic argparse pitfall: any non-empty string —
    including "False" — is truthy, so ``--use_gpu False`` would silently
    enable the GPU. This helper interprets the text explicitly and rejects
    unrecognized spellings with a proper argparse error.
    """
    lowered = str(value).strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected, got %r." % value)


def parse_args():
    """Parse and return the command-line arguments of the training script.

    Returns:
        argparse.Namespace with attributes: model_type ("ranknet" or
        "lambdarank"), num_passes (int), use_gpu (bool), trainer_count
        (int), and model_save_dir (str).
    """
    parser = argparse.ArgumentParser(
        description="PaddlePaddle learning to rank example.")
    parser.add_argument(
        "--model_type",
        type=str,
        help=("A flag indicating to run the RankNet or the LambdaRank model. "
              "Available options are: ranknet or lambdarank."),
        default="ranknet")
    parser.add_argument(
        "--num_passes",
        type=int,
        help="The number of passes to train the model.",
        default=10)
    parser.add_argument(
        "--use_gpu",
        # Fixed: was ``type=bool``, which treats "False" as True.
        type=_parse_bool,
        help="A flag indicating whether to use the GPU device in training.",
        default=False)
    parser.add_argument(
        "--trainer_count",
        type=int,
        help="The thread number used in training.",
        default=1)
    parser.add_argument(
        "--model_save_dir",
        type=str,
        required=False,
        help=("The path to save the trained models."),
        default="models")
    return parser.parse_args()
if __name__ == "__main__":
    args = parse_args()
    # Create the checkpoint directory on first run.
    if not os.path.exists(args.model_save_dir):
        os.mkdir(args.model_save_dir)
    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    # Training dataset: mq2007, input_dim = 46, dense format.
    input_dim = 46
    # Dispatch to the requested trainer; both share the same signature.
    trainers = {"ranknet": ranknet_train, "lambdarank": lambda_rank_train}
    if args.model_type in trainers:
        trainers[args.model_type](input_dim, args.num_passes,
                                  args.model_save_dir)
    else:
        logger.fatal(("A wrong value for parameter model type. "
                      "Available options are: ranknet or lambdarank."))
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册