diff --git a/ltr/lambda_rank.py b/ltr/lambda_rank.py
index 2314e3031ab5f1934c2d713bf037c3df52992b8e..49d643c0623a39685ff16bb9c5e93457deb24f08 100644
--- a/ltr/lambda_rank.py
+++ b/ltr/lambda_rank.py
@@ -3,10 +3,14 @@ import sys
 import gzip
 import functools
 import argparse
+import logging
 
 import numpy as np
 import paddle.v2 as paddle
 
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
+
 
 def lambda_rank(input_dim, is_infer):
     """
@@ -26,43 +30,39 @@ def lambda_rank(input_dim, is_infer):
     data = paddle.layer.data("data",
                              paddle.data_type.dense_vector_sequence(input_dim))
 
-    # Define hidden layer.
-    hd1 = paddle.layer.fc(
-        input=data,
-        size=128,
-        act=paddle.activation.Tanh(),
-        param_attr=paddle.attr.Param(initial_std=0.01))
-
-    hd2 = paddle.layer.fc(
-        input=hd1,
-        size=10,
-        act=paddle.activation.Tanh(),
-        param_attr=paddle.attr.Param(initial_std=0.01))
-    output = paddle.layer.fc(
-        input=hd2,
-        size=1,
-        act=paddle.activation.Linear(),
-        param_attr=paddle.attr.Param(initial_std=0.01))
+    # Define the hidden layer.
+    hd1 = paddle.layer.fc(input=data,
+                          size=128,
+                          act=paddle.activation.Tanh(),
+                          param_attr=paddle.attr.Param(initial_std=0.01))
+
+    hd2 = paddle.layer.fc(input=hd1,
+                          size=10,
+                          act=paddle.activation.Tanh(),
+                          param_attr=paddle.attr.Param(initial_std=0.01))
+    output = paddle.layer.fc(input=hd2,
+                             size=1,
+                             act=paddle.activation.Linear(),
+                             param_attr=paddle.attr.Param(initial_std=0.01))
 
     if not is_infer:
-        # Define evaluator.
-        evaluator = paddle.evaluator.auc(input=output, label=label)
-        # Define cost layer.
+        # Define the cost layer.
        cost = paddle.layer.lambda_cost(
             input=output, score=label, NDCG_num=6, max_sort_size=-1)
         return cost, output
     return output
 
 
-def train_lambda_rank(num_passes):
-    # The input for LambdaRank is a sequence.
+def lambda_rank_train(num_passes, model_save_dir):
+    # The input for LambdaRank must be a sequence.
     fill_default_train = functools.partial(
         paddle.dataset.mq2007.train, format="listwise")
     fill_default_test = functools.partial(
         paddle.dataset.mq2007.test, format="listwise")
     train_reader = paddle.batch(
-        paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32)
+        paddle.reader.shuffle(
+            fill_default_train, buf_size=100), batch_size=32)
     test_reader = paddle.batch(fill_default_test, batch_size=32)
 
     # Training dataset: mq2007, input_dim = 46, dense format.
@@ -78,13 +78,15 @@ def train_lambda_rank(num_passes):
     # Define end batch and end pass event handler.
     def event_handler(event):
         if isinstance(event, paddle.event.EndIteration):
-            print "Pass %d Batch %d Cost %.9f" % (event.pass_id, event.batch_id,
-                                                  event.cost)
+            logger.info("Pass %d Batch %d Cost %.9f" %
+                        (event.pass_id, event.batch_id, event.cost))
 
         if isinstance(event, paddle.event.EndPass):
             result = trainer.test(reader=test_reader, feeding=feeding)
-            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
-            with gzip.open("lambda_rank_params_%d.tar.gz" % (event.pass_id),
-                           "w") as f:
+            logger.info("\nTest with Pass %d, %s" %
+                        (event.pass_id, result.metrics))
+            with gzip.open(
+                    os.path.join(model_save_dir, "lambda_rank_params_%d.tar.gz"
+                                 % (event.pass_id)), "w") as f:
                 trainer.save_parameter_to_tar(f)
 
     feeding = {"label": 0, "data": 1}
@@ -95,17 +97,17 @@ def train_lambda_rank(num_passes):
         num_passes=num_passes)
 
 
-def lambda_rank_infer(pass_id):
+def lambda_rank_infer(test_model_path):
     """LambdaRank model inference interface.
 
     Parameters:
-        pass_id : inference model in pass_id
+        test_model_path : The path of the trained model.
     """
-    print "Begin to Infer..."
+ logger.info("Begin to Infer...") input_dim = 46 output = lambda_rank(input_dim, is_infer=True) parameters = paddle.parameters.Parameters.from_tar( - gzip.open("lambda_rank_params_%d.tar.gz" % (pass_id - 1))) + gzip.open(test_model_path)) infer_query_id = None infer_data = [] @@ -128,15 +130,51 @@ def lambda_rank_infer(pass_id): if __name__ == '__main__': - parser = argparse.ArgumentParser(description='LambdaRank demo') - parser.add_argument("--run_type", type=str, help="run type is train|infer") + parser = argparse.ArgumentParser( + description="PaddlePaddle LambdaRank example.") + parser.add_argument( + "--run_type", + type=str, + help=("A flag indicating to run the training or the inferring task. " + "Available options are: train or infer."), + default="train") parser.add_argument( "--num_passes", type=int, - help="The Num of passes in train| infer pass number of model.") + help="The number of passes to train the model.", + default=10) + parser.add_argument( + "--use_gpu", + type=bool, + help="A flag indicating whether to use the GPU device in training.", + default=False) + parser.add_argument( + "--trainer_count", + type=int, + help="The thread number used in training.", + default=1) + parser.add_argument( + "--model_save_dir", + type=str, + required=False, + help=("The path to save the trained models."), + default="models") + parser.add_argument( + "--test_model_path", + type=str, + required=False, + help=("This parameter works only in inferring task to " + "specify path of a trained model."), + default="") + args = parser.parse_args() - paddle.init(use_gpu=False, trainer_count=1) + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) if args.run_type == "train": - train_lambda_rank(args.num_passes) + lambda_rank_train(args.num_passes, args.model_save_dir) elif args.run_type == "infer": - lambda_rank_infer(pass_id=args.num_passes - 1) + assert os.path.exists(args.test_model_path), ( + "The trained model does not exit. Please set a correct path.") + lambda_rank_infer(args.test_model_path) + else: + logger.fatal(("A wrong value for parameter run type. " + "Available options are: train or infer.")) diff --git a/ltr/metrics.py b/ltr/metrics.py index a2bbf3feb3fb9a277b7409f73c722e717a0f675e..be1cc7083930f639e7190e5da2544c98e3e65d16 100644 --- a/ltr/metrics.py +++ b/ltr/metrics.py @@ -10,7 +10,7 @@ def ndcg(score_list): score_list: np.array, shape=(sample_num,1) e.g. 
     predict rank score list :
-    >>> scores = [3, 2, 3, 0, 1, 2]
+    >>> scores = [3, 2, 3, 0, 1, 2]
     >>> ndcg_score = ndcg(scores)
     """
 
diff --git a/ltr/ranknet.py b/ltr/ranknet.py
index 339bb34cfbb17931359399128fe81110fa15301e..89a2a2778d729a573a530bb59530a051fb11c374 100644
--- a/ltr/ranknet.py
+++ b/ltr/ranknet.py
@@ -2,15 +2,23 @@ import os
 import sys
 import gzip
 import functools
-import paddle.v2 as paddle
-import numpy as np
-from metrics import ndcg
 import argparse
+import logging
+
+import numpy as np
+
+import paddle.v2 as paddle
+
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
 
 # ranknet is the classic pairwise learning to rank algorithm
 # http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf
 
 
+def score_diff(right_score, left_score):
+    return np.average(np.abs(right_score - left_score))
+
+
 def half_ranknet(name_prefix, input_dim):
     """
     parameter in same name will be shared in paddle framework,
@@ -19,18 +27,21 @@ def half_ranknet(name_prefix, input_dim):
     https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md
     """
     # data layer
-    data = paddle.layer.data(name_prefix + "/data",
+    data = paddle.layer.data(name_prefix + "_data",
                              paddle.data_type.dense_vector(input_dim))
     # hidden layer
     hd1 = paddle.layer.fc(
         input=data,
+        name=name_prefix + "_hidden",
         size=10,
         act=paddle.activation.Tanh(),
         param_attr=paddle.attr.Param(initial_std=0.01, name="hidden_w1"))
-    # fully connect layer/ output layer
+
+    # fully connected layer and output layer
     output = paddle.layer.fc(
         input=hd1,
+        name=name_prefix + "_score",
        size=1,
         act=paddle.activation.Linear(),
         param_attr=paddle.attr.Param(initial_std=0.01, name="output"))
@@ -45,14 +56,13 @@ def ranknet(input_dim):
     output_left = half_ranknet("left", input_dim)
     output_right = half_ranknet("right", input_dim)
 
-    evaluator = paddle.evaluator.auc(input=output_left, label=label)
     # rankcost layer
     cost = paddle.layer.rank_cost(
         name="cost", left=output_left, right=output_right, label=label)
     return cost
 
 
-def train_ranknet(num_passes):
+def ranknet_train(num_passes, model_save_dir):
     train_reader = paddle.batch(
         paddle.reader.shuffle(paddle.dataset.mq2007.train, buf_size=100),
         batch_size=100)
@@ -70,22 +80,28 @@ def train_ranknet(num_passes):
         update_equation=paddle.optimizer.Adam(learning_rate=2e-4))
 
     # Define the input data order
-    feeding = {"label": 0, "left/data": 1, "right/data": 2}
+    feeding = {"label": 0, "left_data": 1, "right_data": 2}
 
     # Define end batch and end pass event handler
     def event_handler(event):
         if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
-                print "Pass %d Batch %d Cost %.9f" % (
-                    event.pass_id, event.batch_id, event.cost)
-            else:
-                sys.stdout.write(".")
-                sys.stdout.flush()
+            if event.batch_id % 25 == 0:
+                diff = score_diff(
+                    event.gm.getLayerOutputs("right_score")["right_score"][
+                        "value"],
+                    event.gm.getLayerOutputs("left_score")["left_score"][
+                        "value"])
+                logger.info(("Pass %d Batch %d : Cost %.6f, "
+                             "average absolute diff scores: %.6f") %
+                            (event.pass_id, event.batch_id, event.cost, diff))
+
         if isinstance(event, paddle.event.EndPass):
             result = trainer.test(reader=test_reader, feeding=feeding)
-            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
-            with gzip.open("ranknet_params_%d.tar.gz" % (event.pass_id),
-                           "w") as f:
+            logger.info("\nTest with Pass %d, %s" %
+                        (event.pass_id, result.metrics))
+            with gzip.open(
+                    os.path.join(model_save_dir, "ranknet_params_%d.tar.gz" %
+                                 (event.pass_id)), "w") as f:
                 trainer.save_parameter_to_tar(f)
 
     trainer.train(
@@ -95,18 +111,17 @@ def train_ranknet(num_passes):
         num_passes=num_passes)
 
 
-def ranknet_infer(pass_id):
+def ranknet_infer(model_path):
     """
-    load the trained model. And predict with plain txt input
-    """
-    print "Begin to Infer..."
+    Load the trained model and predict with plain text input.
+    """
+    logger.info("Begin to Infer...")
     feature_dim = 46
 
     # we just need half_ranknet to predict a rank score,
     # which can be used in sort documents
     output = half_ranknet("infer", feature_dim)
-    parameters = paddle.parameters.Parameters.from_tar(
-        gzip.open("ranknet_params_%d.tar.gz" % (pass_id)))
+    parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
 
     # load data of same query and relevance documents,
     # need ranknet to rank these candidates
@@ -133,16 +148,55 @@ def ranknet_infer(pass_id):
         print "query_id : ", query_id, " ranknet rank document order : ", score
 
 
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Ranknet demo')
-    parser.add_argument("--run_type", type=str, help="run type is train|infer")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle RankNet example.")
+    parser.add_argument(
+        "--run_type",
+        type=str,
+        help=("A flag indicating whether to run the training or the inference "
+              "task. Available options are: train or infer."),
+        default="train")
     parser.add_argument(
         "--num_passes",
         type=int,
-        help="num of passes in train| infer pass number of model")
+        help="The number of passes to train the model.",
+        default=10)
+    parser.add_argument(
+        "--use_gpu",
+        type=bool,
+        help="A flag indicating whether to use the GPU device in training.",
+        default=False)
+    parser.add_argument(
+        "--trainer_count",
+        type=int,
+        help="The number of threads used in training.",
+        default=1)
+    parser.add_argument(
+        "--model_save_dir",
+        type=str,
+        required=False,
+        help=("The path to save the trained models."),
+        default="models")
+    parser.add_argument(
+        "--test_model_path",
+        type=str,
+        required=False,
+        help=("This parameter works only in the inference task to "
+              "specify the path of a trained model."),
+        default="")
+
     args = parser.parse_args()
-    paddle.init(use_gpu=False, trainer_count=4)
+
+    if not os.path.exists(args.model_save_dir):
+        os.mkdir(args.model_save_dir)
+
+    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
+
     if args.run_type == "train":
-        train_ranknet(args.num_passes)
+        ranknet_train(args.num_passes, args.model_save_dir)
     elif args.run_type == "infer":
-        ranknet_infer(pass_id=args.pass_num - 1)
+        assert os.path.exists(
+            args.test_model_path), "The trained model does not exist."
+        ranknet_infer(args.test_model_path)
+    else:
+        logger.fatal(("Invalid value for the --run_type parameter. "
" + "Available options are: train or infer.")) diff --git a/ltr/run_lambdarank.sh b/ltr/run_lambdarank.sh deleted file mode 100644 index 9546be2cb1fd19d66352091a88379f61283a1f6b..0000000000000000000000000000000000000000 --- a/ltr/run_lambdarank.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/sh - -python lambda_rank.py \ - --run_type="train" \ - --num_passes=10 \ - 2>&1 | tee lambdarank_train.log - -python lambda_rank.py \ - --run_type="infer" \ - --num_passes=10 \ - 2>&1 | tee lambdarank_infer.log diff --git a/ltr/run_ranknet.sh b/ltr/run_ranknet.sh deleted file mode 100644 index 8c574ffd4b036101d0a69e9c1f7dbc4360306d10..0000000000000000000000000000000000000000 --- a/ltr/run_ranknet.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/sh - -python ranknet.py \ - --run_type="train" \ - --num_passes=10 \ - 2>&1 | tee ranknet_train.log - -python ranknet.py \ - --run_type="infer" \ - --num_passes=10 \ - 2>&1 | tee ranknet_infer.log diff --git a/text_classification/utils.py b/text_classification/utils.py index d14054d331987b99d2775b2a146b8c13b0d0fab4..c9b0a85450199612c6bc6f56c812cbb9f71f501d 100644 --- a/text_classification/utils.py +++ b/text_classification/utils.py @@ -9,60 +9,60 @@ logger.setLevel(logging.INFO) def parse_train_cmd(): parser = argparse.ArgumentParser( - description="PaddlePaddle text classification demo") + description="PaddlePaddle text classification example.") parser.add_argument( "--nn_type", type=str, - help="define which type of network to use, available: [dnn, cnn]", + help=("A flag that defines which type of network to use, " + "available: [dnn, cnn]."), default="dnn") parser.add_argument( "--train_data_dir", type=str, required=False, - help=("path of training dataset (default: None). " - "if this parameter is not set, " - "paddle.dataset.imdb will be used."), + help=("The path of training dataset (default: None). If this parameter " + "is not set, paddle.dataset.imdb will be used."), default=None) parser.add_argument( "--test_data_dir", type=str, required=False, - help=("path of testing dataset (default: None). " - "if this parameter is not set, " - "paddle.dataset.imdb will be used."), + help=("The path of testing dataset (default: None). If this parameter " + "is not set, paddle.dataset.imdb will be used."), default=None) parser.add_argument( "--word_dict", type=str, required=False, - help=("path of word dictionary (default: None)." - "if this parameter is not set, paddle.dataset.imdb will be used." - "if this parameter is set, but the file does not exist, " - "word dictionay will be built from " - "the training data automatically."), + help=("The path of word dictionary (default: None). If this parameter " + "is not set, paddle.dataset.imdb will be used. If this parameter " + "is set, but the file does not exist, word dictionay " + "will be built from the training data automatically."), default=None) parser.add_argument( "--label_dict", type=str, required=False, - help=("path of label dictionay (default: None)." - "if this parameter is not set, paddle.dataset.imdb will be used." - "if this parameter is set, but the file does not exist, " - "word dictionay will be built from " - "the training data automatically."), + help=("The path of label dictionay (default: None).If this parameter " + "is not set, paddle.dataset.imdb will be used. 
+              "is set, but the file does not exist, the label dictionary "
+              "will be built from the training data automatically."),
         default=None)
     parser.add_argument(
         "--batch_size",
         type=int,
         default=32,
-        help="the number of training examples in one forward/backward pass")
+        help="The number of training examples in one forward/backward pass.")
     parser.add_argument(
-        "--num_passes", type=int, default=10, help="number of passes to train")
+        "--num_passes",
+        type=int,
+        default=10,
+        help="The number of passes to train the model.")
     parser.add_argument(
         "--model_save_dir",
         type=str,
         required=False,
-        help=("path to save the trained models."),
+        help=("The path to save the trained models."),
         default="models")
 
     return parser.parse_args()
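Usage note: with run_lambdarank.sh and run_ranknet.sh deleted, the two LTR examples are driven directly through the new command-line flags. A minimal invocation sketch, assuming the flags and defaults introduced above; the inference paths follow the "*_params_%d.tar.gz" naming used when saving, and pass 9 (the last of 10 passes) is only illustrative:

    python ranknet.py --run_type=train --num_passes=10 --model_save_dir=models
    python ranknet.py --run_type=infer --test_model_path=models/ranknet_params_9.tar.gz
    python lambda_rank.py --run_type=train --num_passes=10 --model_save_dir=models
    python lambda_rank.py --run_type=infer --test_model_path=models/lambda_rank_params_9.tar.gz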