提交 eb88169e 编写于 作者: C caoying03

Fix the learning-to-rank (LTR) examples.

上级 ede5a045
...@@ -3,10 +3,14 @@ import sys ...@@ -3,10 +3,14 @@ import sys
import gzip import gzip
import functools import functools
import argparse import argparse
import logging
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle.v2 as paddle
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def lambda_rank(input_dim, is_infer): def lambda_rank(input_dim, is_infer):
""" """
...@@ -26,43 +30,39 @@ def lambda_rank(input_dim, is_infer): ...@@ -26,43 +30,39 @@ def lambda_rank(input_dim, is_infer):
data = paddle.layer.data("data", data = paddle.layer.data("data",
paddle.data_type.dense_vector_sequence(input_dim)) paddle.data_type.dense_vector_sequence(input_dim))
# Define hidden layer. # Define the hidden layer.
hd1 = paddle.layer.fc( hd1 = paddle.layer.fc(input=data,
input=data, size=128,
size=128, act=paddle.activation.Tanh(),
act=paddle.activation.Tanh(), param_attr=paddle.attr.Param(initial_std=0.01))
param_attr=paddle.attr.Param(initial_std=0.01))
hd2 = paddle.layer.fc(input=hd1,
hd2 = paddle.layer.fc( size=10,
input=hd1, act=paddle.activation.Tanh(),
size=10, param_attr=paddle.attr.Param(initial_std=0.01))
act=paddle.activation.Tanh(), output = paddle.layer.fc(input=hd2,
param_attr=paddle.attr.Param(initial_std=0.01)) size=1,
output = paddle.layer.fc( act=paddle.activation.Linear(),
input=hd2, param_attr=paddle.attr.Param(initial_std=0.01))
size=1,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(initial_std=0.01))
if not is_infer: if not is_infer:
# Define evaluator. # Define the cost layer.
evaluator = paddle.evaluator.auc(input=output, label=label)
# Define cost layer.
cost = paddle.layer.lambda_cost( cost = paddle.layer.lambda_cost(
input=output, score=label, NDCG_num=6, max_sort_size=-1) input=output, score=label, NDCG_num=6, max_sort_size=-1)
return cost, output return cost, output
return output return output
def train_lambda_rank(num_passes): def lambda_rank_train(num_passes, model_save_dir):
# The input for LambdaRank is a sequence. # The input for LambdaRank must be a sequence.
fill_default_train = functools.partial( fill_default_train = functools.partial(
paddle.dataset.mq2007.train, format="listwise") paddle.dataset.mq2007.train, format="listwise")
fill_default_test = functools.partial( fill_default_test = functools.partial(
paddle.dataset.mq2007.test, format="listwise") paddle.dataset.mq2007.test, format="listwise")
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32) paddle.reader.shuffle(
fill_default_train, buf_size=100), batch_size=32)
test_reader = paddle.batch(fill_default_test, batch_size=32) test_reader = paddle.batch(fill_default_test, batch_size=32)
# Training dataset: mq2007, input_dim = 46, dense format. # Training dataset: mq2007, input_dim = 46, dense format.
...@@ -78,13 +78,15 @@ def train_lambda_rank(num_passes): ...@@ -78,13 +78,15 @@ def train_lambda_rank(num_passes):
# Define end batch and end pass event handler. # Define end batch and end pass event handler.
def event_handler(event): def event_handler(event):
if isinstance(event, paddle.event.EndIteration): if isinstance(event, paddle.event.EndIteration):
print "Pass %d Batch %d Cost %.9f" % (event.pass_id, event.batch_id, logger.info("Pass %d Batch %d Cost %.9f" %
event.cost) (event.pass_id, event.batch_id, event.cost))
if isinstance(event, paddle.event.EndPass): if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_reader, feeding=feeding) result = trainer.test(reader=test_reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) logger.info("\nTest with Pass %d, %s" %
with gzip.open("lambda_rank_params_%d.tar.gz" % (event.pass_id), (event.pass_id, result.metrics))
"w") as f: with gzip.open(
os.path.join(model_save_dir, "lambda_rank_params_%d.tar.gz"
% (event.pass_id)), "w") as f:
trainer.save_parameter_to_tar(f) trainer.save_parameter_to_tar(f)
feeding = {"label": 0, "data": 1} feeding = {"label": 0, "data": 1}
...@@ -95,17 +97,17 @@ def train_lambda_rank(num_passes): ...@@ -95,17 +97,17 @@ def train_lambda_rank(num_passes):
num_passes=num_passes) num_passes=num_passes)
def lambda_rank_infer(pass_id): def lambda_rank_infer(test_model_path):
"""LambdaRank model inference interface. """LambdaRank model inference interface.
Parameters: Parameters:
pass_id : inference model in pass_id test_model_path : The path of the trained model.
""" """
print "Begin to Infer..." logger.info("Begin to Infer...")
input_dim = 46 input_dim = 46
output = lambda_rank(input_dim, is_infer=True) output = lambda_rank(input_dim, is_infer=True)
parameters = paddle.parameters.Parameters.from_tar( parameters = paddle.parameters.Parameters.from_tar(
gzip.open("lambda_rank_params_%d.tar.gz" % (pass_id - 1))) gzip.open(test_model_path))
infer_query_id = None infer_query_id = None
infer_data = [] infer_data = []
...@@ -128,15 +130,51 @@ def lambda_rank_infer(pass_id): ...@@ -128,15 +130,51 @@ def lambda_rank_infer(pass_id):
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser(description='LambdaRank demo') parser = argparse.ArgumentParser(
parser.add_argument("--run_type", type=str, help="run type is train|infer") description="PaddlePaddle LambdaRank example.")
parser.add_argument(
"--run_type",
type=str,
help=("A flag indicating to run the training or the inferring task. "
"Available options are: train or infer."),
default="train")
parser.add_argument( parser.add_argument(
"--num_passes", "--num_passes",
type=int, type=int,
help="The Num of passes in train| infer pass number of model.") help="The number of passes to train the model.",
default=10)
parser.add_argument(
"--use_gpu",
type=bool,
help="A flag indicating whether to use the GPU device in training.",
default=False)
parser.add_argument(
"--trainer_count",
type=int,
help="The thread number used in training.",
default=1)
parser.add_argument(
"--model_save_dir",
type=str,
required=False,
help=("The path to save the trained models."),
default="models")
parser.add_argument(
"--test_model_path",
type=str,
required=False,
help=("This parameter works only in inferring task to "
"specify path of a trained model."),
default="")
args = parser.parse_args() args = parser.parse_args()
paddle.init(use_gpu=False, trainer_count=1) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
if args.run_type == "train": if args.run_type == "train":
train_lambda_rank(args.num_passes) lambda_rank_train(args.num_passes, args.model_save_dir)
elif args.run_type == "infer": elif args.run_type == "infer":
lambda_rank_infer(pass_id=args.num_passes - 1) assert os.path.exists(args.test_model_path), (
"The trained model does not exit. Please set a correct path.")
lambda_rank_infer(args.test_model_path)
else:
logger.fatal(("A wrong value for parameter run type. "
"Available options are: train or infer."))
...@@ -10,7 +10,7 @@ def ndcg(score_list): ...@@ -10,7 +10,7 @@ def ndcg(score_list):
score_list: np.array, shape=(sample_num,1) score_list: np.array, shape=(sample_num,1)
e.g. predict rank score list : e.g. predict rank score list :
>>> scores = [3, 2, 3, 0, 1, 2] >>> scores = [3, 2, 3, 0, 1, 2]
>>> ndcg_score = ndcg(scores) >>> ndcg_score = ndcg(scores)
""" """
......
...@@ -2,15 +2,23 @@ import os ...@@ -2,15 +2,23 @@ import os
import sys import sys
import gzip import gzip
import functools import functools
import paddle.v2 as paddle
import numpy as np
from metrics import ndcg
import argparse import argparse
import logging
import numpy as np
import paddle.v2 as paddle
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
# ranknet is the classic pairwise learning to rank algorithm # ranknet is the classic pairwise learning to rank algorithm
# http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf # http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf
def score_diff(right_score, left_score):
return np.average(np.abs(right_score - left_score))
def half_ranknet(name_prefix, input_dim): def half_ranknet(name_prefix, input_dim):
""" """
parameter in same name will be shared in paddle framework, parameter in same name will be shared in paddle framework,
...@@ -19,18 +27,21 @@ def half_ranknet(name_prefix, input_dim): ...@@ -19,18 +27,21 @@ def half_ranknet(name_prefix, input_dim):
https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md
""" """
# data layer # data layer
data = paddle.layer.data(name_prefix + "/data", data = paddle.layer.data(name_prefix + "_data",
paddle.data_type.dense_vector(input_dim)) paddle.data_type.dense_vector(input_dim))
# hidden layer # hidden layer
hd1 = paddle.layer.fc( hd1 = paddle.layer.fc(
input=data, input=data,
name=name_prefix + "_hidden",
size=10, size=10,
act=paddle.activation.Tanh(), act=paddle.activation.Tanh(),
param_attr=paddle.attr.Param(initial_std=0.01, name="hidden_w1")) param_attr=paddle.attr.Param(initial_std=0.01, name="hidden_w1"))
# fully connect layer/ output layer
# fully connected layer and output layer
output = paddle.layer.fc( output = paddle.layer.fc(
input=hd1, input=hd1,
name=name_prefix + "_score",
size=1, size=1,
act=paddle.activation.Linear(), act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(initial_std=0.01, name="output")) param_attr=paddle.attr.Param(initial_std=0.01, name="output"))
...@@ -45,14 +56,13 @@ def ranknet(input_dim): ...@@ -45,14 +56,13 @@ def ranknet(input_dim):
output_left = half_ranknet("left", input_dim) output_left = half_ranknet("left", input_dim)
output_right = half_ranknet("right", input_dim) output_right = half_ranknet("right", input_dim)
evaluator = paddle.evaluator.auc(input=output_left, label=label)
# rankcost layer # rankcost layer
cost = paddle.layer.rank_cost( cost = paddle.layer.rank_cost(
name="cost", left=output_left, right=output_right, label=label) name="cost", left=output_left, right=output_right, label=label)
return cost return cost
def train_ranknet(num_passes): def ranknet_train(num_passes, model_save_dir):
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.reader.shuffle(paddle.dataset.mq2007.train, buf_size=100), paddle.reader.shuffle(paddle.dataset.mq2007.train, buf_size=100),
batch_size=100) batch_size=100)
...@@ -70,22 +80,28 @@ def train_ranknet(num_passes): ...@@ -70,22 +80,28 @@ def train_ranknet(num_passes):
update_equation=paddle.optimizer.Adam(learning_rate=2e-4)) update_equation=paddle.optimizer.Adam(learning_rate=2e-4))
# Define the input data order # Define the input data order
feeding = {"label": 0, "left/data": 1, "right/data": 2} feeding = {"label": 0, "left_data": 1, "right_data": 2}
# Define end batch and end pass event handler # Define end batch and end pass event handler
def event_handler(event): def event_handler(event):
if isinstance(event, paddle.event.EndIteration): if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0: if event.batch_id % 25 == 0:
print "Pass %d Batch %d Cost %.9f" % ( diff = score_diff(
event.pass_id, event.batch_id, event.cost) event.gm.getLayerOutputs("right_score")["right_score"][
else: "value"],
sys.stdout.write(".") event.gm.getLayerOutputs("left_score")["left_score"][
sys.stdout.flush() "value"])
logger.info(("Pass %d Batch %d : Cost %.6f, "
"average absolute diff scores: %.6f") %
(event.pass_id, event.batch_id, event.cost, diff))
if isinstance(event, paddle.event.EndPass): if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_reader, feeding=feeding) result = trainer.test(reader=test_reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) logger.info("\nTest with Pass %d, %s" %
with gzip.open("ranknet_params_%d.tar.gz" % (event.pass_id), (event.pass_id, result.metrics))
"w") as f: with gzip.open(
os.path.join(model_save_dir, "ranknet_params_%d.tar.gz" %
(event.pass_id)), "w") as f:
trainer.save_parameter_to_tar(f) trainer.save_parameter_to_tar(f)
trainer.train( trainer.train(
...@@ -95,18 +111,17 @@ def train_ranknet(num_passes): ...@@ -95,18 +111,17 @@ def train_ranknet(num_passes):
num_passes=num_passes) num_passes=num_passes)
def ranknet_infer(pass_id): def ranknet_infer(model_path):
""" """
load the trained model. And predict with plain txt input load the trained model. And predict with plain txt input
""" """
print "Begin to Infer..." logger.info("Begin to Infer...")
feature_dim = 46 feature_dim = 46
# we just need half_ranknet to predict a rank score, # we just need half_ranknet to predict a rank score,
# which can be used in sort documents # which can be used in sort documents
output = half_ranknet("infer", feature_dim) output = half_ranknet("infer", feature_dim)
parameters = paddle.parameters.Parameters.from_tar( parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
gzip.open("ranknet_params_%d.tar.gz" % (pass_id)))
# load data of same query and relevance documents, # load data of same query and relevance documents,
# need ranknet to rank these candidates # need ranknet to rank these candidates
...@@ -133,16 +148,55 @@ def ranknet_infer(pass_id): ...@@ -133,16 +148,55 @@ def ranknet_infer(pass_id):
print "query_id : ", query_id, " ranknet rank document order : ", score print "query_id : ", query_id, " ranknet rank document order : ", score
if __name__ == '__main__': if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Ranknet demo') parser = argparse.ArgumentParser(
parser.add_argument("--run_type", type=str, help="run type is train|infer") description="PaddlePaddle RankNet example.")
parser.add_argument(
"--run_type",
type=str,
help=("A flag indicating to run the training or the inferring task. "
"Available options are: train or infer."),
default="train")
parser.add_argument( parser.add_argument(
"--num_passes", "--num_passes",
type=int, type=int,
help="num of passes in train| infer pass number of model") help="The number of passes to train the model.",
default=10)
parser.add_argument(
"--use_gpu",
type=bool,
help="A flag indicating whether to use the GPU device in training.",
default=False)
parser.add_argument(
"--trainer_count",
type=int,
help="The thread number used in training.",
default=1)
parser.add_argument(
"--model_save_dir",
type=str,
required=False,
help=("The path to save the trained models."),
default="models")
parser.add_argument(
"--test_model_path",
type=str,
required=False,
help=("This parameter works only in inferring task to "
"specify path of a trained model."),
default="")
args = parser.parse_args() args = parser.parse_args()
paddle.init(use_gpu=False, trainer_count=4) if not os.path.exists(args.model_save_dir): os.mkdir(args.model_save_dir)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
if args.run_type == "train": if args.run_type == "train":
train_ranknet(args.num_passes) ranknet_train(args.num_passes, args.model_save_dir)
elif args.run_type == "infer": elif args.run_type == "infer":
ranknet_infer(pass_id=args.pass_num - 1) assert os.path.exists(
args.test_model_path), "The trained model does not exit."
ranknet_infer(args.test_model_path)
else:
logger.fatal(("A wrong value for parameter run type. "
"Available options are: train or infer."))
#!/bin/sh
# Train the LambdaRank example, then run inference with the checkpoint written
# by the last training pass. Output of each step is mirrored into a log file.

NUM_PASSES=10
MODEL_SAVE_DIR="models"

# lambda_rank.py saves one checkpoint per pass into --model_save_dir but does
# not create that directory itself, so make sure it exists before training.
mkdir -p "$MODEL_SAVE_DIR"

python lambda_rank.py \
    --run_type="train" \
    --num_passes="$NUM_PASSES" \
    --model_save_dir="$MODEL_SAVE_DIR" \
    2>&1 | tee lambdarank_train.log

# The infer task ignores --num_passes and requires --test_model_path to point
# at an existing checkpoint (the script asserts on it). Checkpoints are named
# lambda_rank_params_<pass_id>.tar.gz; pass ids are assumed to be zero-based,
# so the final pass is NUM_PASSES - 1.
python lambda_rank.py \
    --run_type="infer" \
    --test_model_path="$MODEL_SAVE_DIR/lambda_rank_params_$((NUM_PASSES - 1)).tar.gz" \
    2>&1 | tee lambdarank_infer.log
#!/bin/sh
# Train the RankNet example, then run inference with the checkpoint written by
# the last training pass. Output of each step is mirrored into a log file.

NUM_PASSES=10
MODEL_SAVE_DIR="models"

# ranknet.py creates --model_save_dir on its own when it is missing.
python ranknet.py \
    --run_type="train" \
    --num_passes="$NUM_PASSES" \
    --model_save_dir="$MODEL_SAVE_DIR" \
    2>&1 | tee ranknet_train.log

# The infer task ignores --num_passes and requires --test_model_path to point
# at an existing checkpoint (the script asserts on it). Checkpoints are named
# ranknet_params_<pass_id>.tar.gz; pass ids are assumed to be zero-based, so
# the final pass is NUM_PASSES - 1.
python ranknet.py \
    --run_type="infer" \
    --test_model_path="$MODEL_SAVE_DIR/ranknet_params_$((NUM_PASSES - 1)).tar.gz" \
    2>&1 | tee ranknet_infer.log
...@@ -9,60 +9,60 @@ logger.setLevel(logging.INFO) ...@@ -9,60 +9,60 @@ logger.setLevel(logging.INFO)
def parse_train_cmd(): def parse_train_cmd():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="PaddlePaddle text classification demo") description="PaddlePaddle text classification example.")
parser.add_argument( parser.add_argument(
"--nn_type", "--nn_type",
type=str, type=str,
help="define which type of network to use, available: [dnn, cnn]", help=("A flag that defines which type of network to use, "
"available: [dnn, cnn]."),
default="dnn") default="dnn")
parser.add_argument( parser.add_argument(
"--train_data_dir", "--train_data_dir",
type=str, type=str,
required=False, required=False,
help=("path of training dataset (default: None). " help=("The path of training dataset (default: None). If this parameter "
"if this parameter is not set, " "is not set, paddle.dataset.imdb will be used."),
"paddle.dataset.imdb will be used."),
default=None) default=None)
parser.add_argument( parser.add_argument(
"--test_data_dir", "--test_data_dir",
type=str, type=str,
required=False, required=False,
help=("path of testing dataset (default: None). " help=("The path of testing dataset (default: None). If this parameter "
"if this parameter is not set, " "is not set, paddle.dataset.imdb will be used."),
"paddle.dataset.imdb will be used."),
default=None) default=None)
parser.add_argument( parser.add_argument(
"--word_dict", "--word_dict",
type=str, type=str,
required=False, required=False,
help=("path of word dictionary (default: None)." help=("The path of word dictionary (default: None). If this parameter "
"if this parameter is not set, paddle.dataset.imdb will be used." "is not set, paddle.dataset.imdb will be used. If this parameter "
"if this parameter is set, but the file does not exist, " "is set, but the file does not exist, word dictionay "
"word dictionay will be built from " "will be built from the training data automatically."),
"the training data automatically."),
default=None) default=None)
parser.add_argument( parser.add_argument(
"--label_dict", "--label_dict",
type=str, type=str,
required=False, required=False,
help=("path of label dictionay (default: None)." help=("The path of label dictionay (default: None).If this parameter "
"if this parameter is not set, paddle.dataset.imdb will be used." "is not set, paddle.dataset.imdb will be used. If this parameter "
"if this parameter is set, but the file does not exist, " "is set, but the file does not exist, word dictionay "
"word dictionay will be built from " "will be built from the training data automatically."),
"the training data automatically."),
default=None) default=None)
parser.add_argument( parser.add_argument(
"--batch_size", "--batch_size",
type=int, type=int,
default=32, default=32,
help="the number of training examples in one forward/backward pass") help="The number of training examples in one forward/backward pass.")
parser.add_argument( parser.add_argument(
"--num_passes", type=int, default=10, help="number of passes to train") "--num_passes",
type=int,
default=10,
help="The number of passes to train the model.")
parser.add_argument( parser.add_argument(
"--model_save_dir", "--model_save_dir",
type=str, type=str,
required=False, required=False,
help=("path to save the trained models."), help=("The path to save the trained models."),
default="models") default="models")
return parser.parse_args() return parser.parse_args()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册