提交 eb88169e 编写于 作者: C caoying03

fix LTR example.

上级 ede5a045
......@@ -3,10 +3,14 @@ import sys
import gzip
import functools
import argparse
import logging
import numpy as np
import paddle.v2 as paddle
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def lambda_rank(input_dim, is_infer):
"""
......@@ -26,43 +30,39 @@ def lambda_rank(input_dim, is_infer):
data = paddle.layer.data("data",
paddle.data_type.dense_vector_sequence(input_dim))
# Define hidden layer.
hd1 = paddle.layer.fc(
input=data,
# Define the hidden layer.
hd1 = paddle.layer.fc(input=data,
size=128,
act=paddle.activation.Tanh(),
param_attr=paddle.attr.Param(initial_std=0.01))
hd2 = paddle.layer.fc(
input=hd1,
hd2 = paddle.layer.fc(input=hd1,
size=10,
act=paddle.activation.Tanh(),
param_attr=paddle.attr.Param(initial_std=0.01))
output = paddle.layer.fc(
input=hd2,
output = paddle.layer.fc(input=hd2,
size=1,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(initial_std=0.01))
if not is_infer:
# Define evaluator.
evaluator = paddle.evaluator.auc(input=output, label=label)
# Define cost layer.
# Define the cost layer.
cost = paddle.layer.lambda_cost(
input=output, score=label, NDCG_num=6, max_sort_size=-1)
return cost, output
return output
def train_lambda_rank(num_passes):
# The input for LambdaRank is a sequence.
def lambda_rank_train(num_passes, model_save_dir):
# The input for LambdaRank must be a sequence.
fill_default_train = functools.partial(
paddle.dataset.mq2007.train, format="listwise")
fill_default_test = functools.partial(
paddle.dataset.mq2007.test, format="listwise")
train_reader = paddle.batch(
paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32)
paddle.reader.shuffle(
fill_default_train, buf_size=100), batch_size=32)
test_reader = paddle.batch(fill_default_test, batch_size=32)
# Training dataset: mq2007, input_dim = 46, dense format.
......@@ -78,13 +78,15 @@ def train_lambda_rank(num_passes):
# Define end batch and end pass event handler.
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
print "Pass %d Batch %d Cost %.9f" % (event.pass_id, event.batch_id,
event.cost)
logger.info("Pass %d Batch %d Cost %.9f" %
(event.pass_id, event.batch_id, event.cost))
if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
with gzip.open("lambda_rank_params_%d.tar.gz" % (event.pass_id),
"w") as f:
logger.info("\nTest with Pass %d, %s" %
(event.pass_id, result.metrics))
with gzip.open(
os.path.join(model_save_dir, "lambda_rank_params_%d.tar.gz"
% (event.pass_id)), "w") as f:
trainer.save_parameter_to_tar(f)
feeding = {"label": 0, "data": 1}
......@@ -95,17 +97,17 @@ def train_lambda_rank(num_passes):
num_passes=num_passes)
def lambda_rank_infer(pass_id):
def lambda_rank_infer(test_model_path):
"""LambdaRank model inference interface.
Parameters:
pass_id : inference model in pass_id
test_model_path : The path of the trained model.
"""
print "Begin to Infer..."
logger.info("Begin to Infer...")
input_dim = 46
output = lambda_rank(input_dim, is_infer=True)
parameters = paddle.parameters.Parameters.from_tar(
gzip.open("lambda_rank_params_%d.tar.gz" % (pass_id - 1)))
gzip.open(test_model_path))
infer_query_id = None
infer_data = []
......@@ -128,15 +130,51 @@ def lambda_rank_infer(pass_id):
if __name__ == '__main__':
    # Command-line entry point of the LambdaRank example: parse the options,
    # initialise PaddlePaddle, then dispatch to training or inference.

    def _str2bool(value):
        """Convert a command-line string to a bool.

        Plain ``type=bool`` is not usable with argparse because every
        non-empty string, including "False", is truthy.
        """
        lowered = value.lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        if lowered in ("false", "f", "no", "n", "0"):
            return False
        raise argparse.ArgumentTypeError(
            "Expected a boolean value, got %r." % value)

    parser = argparse.ArgumentParser(
        description="PaddlePaddle LambdaRank example.")
    parser.add_argument(
        "--run_type",
        type=str,
        help=("A flag indicating to run the training or the inferring task. "
              "Available options are: train or infer."),
        default="train")
    parser.add_argument(
        "--num_passes",
        type=int,
        help="The number of passes to train the model.",
        default=10)
    parser.add_argument(
        "--use_gpu",
        type=_str2bool,
        help="A flag indicating whether to use the GPU device in training.",
        default=False)
    parser.add_argument(
        "--trainer_count",
        type=int,
        help="The thread number used in training.",
        default=1)
    parser.add_argument(
        "--model_save_dir",
        type=str,
        required=False,
        help=("The path to save the trained models."),
        default="models")
    parser.add_argument(
        "--test_model_path",
        type=str,
        required=False,
        help=("This parameter works only in inferring task to "
              "specify path of a trained model."),
        default="")
    args = parser.parse_args()

    # Training writes one parameter archive per pass into this directory,
    # so it has to exist before the first pass ends.
    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)

    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    if args.run_type == "train":
        lambda_rank_train(args.num_passes, args.model_save_dir)
    elif args.run_type == "infer":
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if not os.path.exists(args.test_model_path):
            parser.error("The trained model does not exist. "
                         "Please set a correct path.")
        lambda_rank_infer(args.test_model_path)
    else:
        logger.fatal(("A wrong value for parameter run type. "
                      "Available options are: train or infer."))
......@@ -2,15 +2,23 @@ import os
import sys
import gzip
import functools
import paddle.v2 as paddle
import numpy as np
from metrics import ndcg
import argparse
import logging
import numpy as np
import paddle.v2 as paddle
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
# ranknet is the classic pairwise learning to rank algorithm
# http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf
def score_diff(right_score, left_score):
    """Return the mean absolute difference between two sets of scores.

    Both arguments may be NumPy arrays or any array-like (e.g. lists); they
    are converted with ``np.asarray`` and subtracted element-wise.
    """
    right = np.asarray(right_score)
    left = np.asarray(left_score)
    return np.mean(np.abs(right - left))
def half_ranknet(name_prefix, input_dim):
"""
parameter in same name will be shared in paddle framework,
......@@ -19,18 +27,21 @@ def half_ranknet(name_prefix, input_dim):
https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md
"""
# data layer
data = paddle.layer.data(name_prefix + "/data",
data = paddle.layer.data(name_prefix + "_data",
paddle.data_type.dense_vector(input_dim))
# hidden layer
hd1 = paddle.layer.fc(
input=data,
name=name_prefix + "_hidden",
size=10,
act=paddle.activation.Tanh(),
param_attr=paddle.attr.Param(initial_std=0.01, name="hidden_w1"))
# fully connect layer/ output layer
# fully connected layer and output layer
output = paddle.layer.fc(
input=hd1,
name=name_prefix + "_score",
size=1,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(initial_std=0.01, name="output"))
......@@ -45,14 +56,13 @@ def ranknet(input_dim):
output_left = half_ranknet("left", input_dim)
output_right = half_ranknet("right", input_dim)
evaluator = paddle.evaluator.auc(input=output_left, label=label)
# rankcost layer
cost = paddle.layer.rank_cost(
name="cost", left=output_left, right=output_right, label=label)
return cost
def train_ranknet(num_passes):
def ranknet_train(num_passes, model_save_dir):
train_reader = paddle.batch(
paddle.reader.shuffle(paddle.dataset.mq2007.train, buf_size=100),
batch_size=100)
......@@ -70,22 +80,28 @@ def train_ranknet(num_passes):
update_equation=paddle.optimizer.Adam(learning_rate=2e-4))
# Define the input data order
feeding = {"label": 0, "left/data": 1, "right/data": 2}
feeding = {"label": 0, "left_data": 1, "right_data": 2}
# Define end batch and end pass event handler
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d Batch %d Cost %.9f" % (
event.pass_id, event.batch_id, event.cost)
else:
sys.stdout.write(".")
sys.stdout.flush()
if event.batch_id % 25 == 0:
diff = score_diff(
event.gm.getLayerOutputs("right_score")["right_score"][
"value"],
event.gm.getLayerOutputs("left_score")["left_score"][
"value"])
logger.info(("Pass %d Batch %d : Cost %.6f, "
"average absolute diff scores: %.6f") %
(event.pass_id, event.batch_id, event.cost, diff))
if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
with gzip.open("ranknet_params_%d.tar.gz" % (event.pass_id),
"w") as f:
logger.info("\nTest with Pass %d, %s" %
(event.pass_id, result.metrics))
with gzip.open(
os.path.join(model_save_dir, "ranknet_params_%d.tar.gz" %
(event.pass_id)), "w") as f:
trainer.save_parameter_to_tar(f)
trainer.train(
......@@ -95,18 +111,17 @@ def train_ranknet(num_passes):
num_passes=num_passes)
def ranknet_infer(pass_id):
def ranknet_infer(model_path):
"""
load the trained model. And predict with plain txt input
"""
print "Begin to Infer..."
logger.info("Begin to Infer...")
feature_dim = 46
# we just need half_ranknet to predict a rank score,
# which can be used in sort documents
output = half_ranknet("infer", feature_dim)
parameters = paddle.parameters.Parameters.from_tar(
gzip.open("ranknet_params_%d.tar.gz" % (pass_id)))
parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
# load data of same query and relevance documents,
# need ranknet to rank these candidates
......@@ -133,16 +148,55 @@ def ranknet_infer(pass_id):
print "query_id : ", query_id, " ranknet rank document order : ", score
if __name__ == "__main__":
    # Command-line entry point of the RankNet example: parse the options,
    # initialise PaddlePaddle, then dispatch to training or inference.

    def _str2bool(value):
        """Convert a command-line string to a bool.

        Plain ``type=bool`` is not usable with argparse because every
        non-empty string, including "False", is truthy.
        """
        lowered = value.lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        if lowered in ("false", "f", "no", "n", "0"):
            return False
        raise argparse.ArgumentTypeError(
            "Expected a boolean value, got %r." % value)

    parser = argparse.ArgumentParser(
        description="PaddlePaddle RankNet example.")
    parser.add_argument(
        "--run_type",
        type=str,
        help=("A flag indicating to run the training or the inferring task. "
              "Available options are: train or infer."),
        default="train")
    parser.add_argument(
        "--num_passes",
        type=int,
        help="The number of passes to train the model.",
        default=10)
    parser.add_argument(
        "--use_gpu",
        type=_str2bool,
        help="A flag indicating whether to use the GPU device in training.",
        default=False)
    parser.add_argument(
        "--trainer_count",
        type=int,
        help="The thread number used in training.",
        default=1)
    parser.add_argument(
        "--model_save_dir",
        type=str,
        required=False,
        help=("The path to save the trained models."),
        default="models")
    parser.add_argument(
        "--test_model_path",
        type=str,
        required=False,
        help=("This parameter works only in inferring task to "
              "specify path of a trained model."),
        default="")
    args = parser.parse_args()

    # makedirs (rather than mkdir) also handles nested save directories.
    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)

    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    if args.run_type == "train":
        ranknet_train(args.num_passes, args.model_save_dir)
    elif args.run_type == "infer":
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if not os.path.exists(args.test_model_path):
            parser.error("The trained model does not exist.")
        ranknet_infer(args.test_model_path)
    else:
        logger.fatal(("A wrong value for parameter run type. "
                      "Available options are: train or infer."))
#!/bin/sh
# Train the LambdaRank model, then run inference with the parameters saved
# by the last training pass.

NUM_PASSES=10
MODEL_DIR="models"

# lambda_rank.py writes its per-pass parameter archives into this directory.
mkdir -p "${MODEL_DIR}"

python lambda_rank.py \
  --run_type="train" \
  --num_passes="${NUM_PASSES}" \
  --model_save_dir="${MODEL_DIR}" \
  2>&1 | tee lambdarank_train.log

# Inference selects the model through --test_model_path, not --num_passes.
# Archives are named lambda_rank_params_<pass_id>.tar.gz; pass ids are
# assumed to start at 0, so the last one is NUM_PASSES - 1.
python lambda_rank.py \
  --run_type="infer" \
  --test_model_path="${MODEL_DIR}/lambda_rank_params_$((NUM_PASSES - 1)).tar.gz" \
  2>&1 | tee lambdarank_infer.log
#!/bin/sh
# Train the RankNet model, then run inference with the parameters saved by
# the last training pass.

NUM_PASSES=10
MODEL_DIR="models"

python ranknet.py \
  --run_type="train" \
  --num_passes="${NUM_PASSES}" \
  --model_save_dir="${MODEL_DIR}" \
  2>&1 | tee ranknet_train.log

# Inference selects the model through --test_model_path, not --num_passes.
# Archives are named ranknet_params_<pass_id>.tar.gz; pass ids are assumed
# to start at 0, so the last one is NUM_PASSES - 1.
python ranknet.py \
  --run_type="infer" \
  --test_model_path="${MODEL_DIR}/ranknet_params_$((NUM_PASSES - 1)).tar.gz" \
  2>&1 | tee ranknet_infer.log
......@@ -9,60 +9,60 @@ logger.setLevel(logging.INFO)
def parse_train_cmd():
    """Parse the command-line options of the text classification trainer.

    Returns:
        argparse.Namespace with the attributes ``nn_type``,
        ``train_data_dir``, ``test_data_dir``, ``word_dict``, ``label_dict``,
        ``batch_size``, ``num_passes`` and ``model_save_dir``.
    """
    parser = argparse.ArgumentParser(
        description="PaddlePaddle text classification example.")
    parser.add_argument(
        "--nn_type",
        type=str,
        help=("A flag that defines which type of network to use, "
              "available: [dnn, cnn]."),
        default="dnn")
    parser.add_argument(
        "--train_data_dir",
        type=str,
        required=False,
        help=("The path of training dataset (default: None). If this parameter "
              "is not set, paddle.dataset.imdb will be used."),
        default=None)
    parser.add_argument(
        "--test_data_dir",
        type=str,
        required=False,
        help=("The path of testing dataset (default: None). If this parameter "
              "is not set, paddle.dataset.imdb will be used."),
        default=None)
    parser.add_argument(
        "--word_dict",
        type=str,
        required=False,
        help=("The path of word dictionary (default: None). If this parameter "
              "is not set, paddle.dataset.imdb will be used. If this parameter "
              "is set, but the file does not exist, word dictionary "
              "will be built from the training data automatically."),
        default=None)
    parser.add_argument(
        "--label_dict",
        type=str,
        required=False,
        help=("The path of label dictionary (default: None). If this parameter "
              "is not set, paddle.dataset.imdb will be used. If this parameter "
              "is set, but the file does not exist, label dictionary "
              "will be built from the training data automatically."),
        default=None)
    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help="The number of training examples in one forward/backward pass.")
    parser.add_argument(
        "--num_passes",
        type=int,
        default=10,
        help="The number of passes to train the model.")
    parser.add_argument(
        "--model_save_dir",
        type=str,
        required=False,
        help=("The path to save the trained models."),
        default="models")

    return parser.parse_args()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册