lambda_rank.py 6.0 KB
Newer Older
C
caoying03 已提交
1 2
import os
import sys
D
dong zhihong 已提交
3 4
import gzip
import functools
D
dongzhihong 已提交
5
import argparse
C
caoying03 已提交
6
import logging
C
caoying03 已提交
7 8 9
import numpy as np

import paddle.v2 as paddle
D
dong zhihong 已提交
10

C
caoying03 已提交
11 12 13
# Module-wide logger: fetch the logger registered under the name "paddle"
# and set its threshold to INFO.
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)

D
dong zhihong 已提交
14

P
peterzhang2029 已提交
15
def lambda_rank(input_dim, is_infer):
    """
    Build the LambdaRank network.

    LambdaRank is a listwise rank model, so both the input data and the
    label must be sequences.

    https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf

    Parameters:
        input_dim : dimension of the dense feature vector of one document.
        is_infer : when false, a "label" data layer and a lambda_cost layer
            are created in addition to the scoring network.

    The "data" layer is a dense_vector_sequence of the following format:
    [[f, ...], [f, ...], ...], f is a float or an int number

    Returns:
        (cost, output) when is_infer is false, otherwise only output, where
        output is the final size-1 fully connected layer.
    """

    def fully_connected(source, width, activation):
        # Every fc layer of the network uses the same weight initialisation.
        return paddle.layer.fc(
            input=source,
            size=width,
            act=activation,
            param_attr=paddle.attr.Param(initial_std=0.01))

    # The label layer is only needed to compute the training cost.
    if not is_infer:
        label = paddle.layer.data(
            "label", paddle.data_type.dense_vector_sequence(1))
    features = paddle.layer.data(
        "data", paddle.data_type.dense_vector_sequence(input_dim))

    # Two Tanh hidden layers (128 and 10 units) followed by a linear
    # single-unit layer that produces one score per document.
    hidden_wide = fully_connected(features, 128, paddle.activation.Tanh())
    hidden_narrow = fully_connected(hidden_wide, 10, paddle.activation.Tanh())
    output = fully_connected(hidden_narrow, 1, paddle.activation.Linear())

    if is_infer:
        return output

    # Define the cost layer.
    cost = paddle.layer.lambda_cost(
        input=output, score=label, NDCG_num=6, max_sort_size=-1)
    return cost, output
57

D
dong zhihong 已提交
58

C
caoying03 已提交
59 60
def lambda_rank_train(num_passes, model_save_dir):
    """Train the LambdaRank model on the mq2007 dataset.

    After every pass the model is evaluated on the mq2007 test reader and
    its parameters are written to
    ``<model_save_dir>/lambda_rank_params_<pass_id>.tar.gz``.

    Parameters:
        num_passes : The number of passes to train the model.
        model_save_dir : The directory the trained parameters are saved to.
            It is created if it does not exist yet.
    """
    # gzip.open() does not create missing parent directories, so without this
    # the first EndPass event would fail when the directory is absent.
    # An empty path is left alone: files are then written relative to the
    # current working directory.
    if model_save_dir and not os.path.isdir(model_save_dir):
        os.makedirs(model_save_dir)

    # The input for LambdaRank must be a sequence.
    fill_default_train = functools.partial(
        paddle.dataset.mq2007.train, format="listwise")
    fill_default_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")

    train_reader = paddle.batch(
        paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32)
    test_reader = paddle.batch(fill_default_test, batch_size=32)

    # Training dataset: mq2007, input_dim = 46, dense format.
    input_dim = 46
    cost, output = lambda_rank(input_dim, is_infer=False)
    parameters = paddle.parameters.create(cost)

    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        update_equation=paddle.optimizer.Adam(learning_rate=1e-4))

    # Position of each data layer's value inside one sample of the reader.
    feeding = {"label": 0, "data": 1}

    #  Define end batch and end pass event handler.
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            logger.info("Pass %d Batch %d Cost %.9f" %
                        (event.pass_id, event.batch_id, event.cost))
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_reader, feeding=feeding)
            logger.info("\nTest with Pass %d, %s" %
                        (event.pass_id, result.metrics))
            # Save a snapshot of the parameters at the end of every pass.
            save_path = os.path.join(
                model_save_dir,
                "lambda_rank_params_%d.tar.gz" % (event.pass_id))
            with gzip.open(save_path, "w") as f:
                trainer.save_parameter_to_tar(f)

    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)
D
dong zhihong 已提交
100 101


C
caoying03 已提交
102
def lambda_rank_infer(test_model_path):
    """LambdaRank model inference interface.

    Loads the trained parameters, takes the first query list of the mq2007
    test set and prints the predicted scores for it.

    Parameters:
        test_model_path : The path of the trained model.
    """
    logger.info("Begin to Infer...")
    input_dim = 46
    output = lambda_rank(input_dim, is_infer=True)

    # Close the archive as soon as the parameters are loaded instead of
    # leaking the file handle.
    with gzip.open(test_model_path) as model_file:
        parameters = paddle.parameters.Parameters.from_tar(model_file)

    infer_data = []
    # Number of query lists to run inference on.
    infer_data_num = 1

    fill_default_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")
    # The label is not needed for inference; only the documents are kept.
    for _, querylist in fill_default_test():
        infer_data.append([querylist])
        if len(infer_data) == infer_data_num:
            break

    # Predict the scores for infer_data and print them in the order
    # returned by paddle.infer. No re-sorting is done here; a ranking can be
    # built by sorting on the score in descending order.
    predictions = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    for i, score in enumerate(predictions):
        # Prints "<index> <score>" identically under Python 2 and Python 3.
        print("%d %s" % (i, score))
132

D
dong zhihong 已提交
133 134

def str2bool(value):
    """Convert a command line string into a bool.

    ``type=bool`` must not be used with argparse: bool() of any non-empty
    string, including "False", is True.

    Parameters:
        value : a bool, or a string such as "true"/"false", "yes"/"no",
            "t"/"f", "y"/"n", "1"/"0" (case-insensitive). An empty string
            is treated as False.

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError : if the value is not recognised.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "Expected a boolean value, got %r." % (value, ))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="PaddlePaddle LambdaRank example.")
    parser.add_argument(
        "--run_type",
        type=str,
        help=("A flag indicating to run the training or the inferring task. "
              "Available options are: train or infer."),
        default="train")
    parser.add_argument(
        "--num_passes",
        type=int,
        help="The number of passes to train the model.",
        default=10)
    parser.add_argument(
        "--use_gpu",
        type=str2bool,
        help="A flag indicating whether to use the GPU device in training.",
        default=False)
    parser.add_argument(
        "--trainer_count",
        type=int,
        help="The thread number used in training.",
        default=1)
    parser.add_argument(
        "--model_save_dir",
        type=str,
        required=False,
        help=("The path to save the trained models."),
        default="models")
    parser.add_argument(
        "--test_model_path",
        type=str,
        required=False,
        help=("This parameter works only in inferring task to "
              "specify path of a trained model."),
        default="")

    args = parser.parse_args()
    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    if args.run_type == "train":
        lambda_rank_train(args.num_passes, args.model_save_dir)
    elif args.run_type == "infer":
        # Explicit check instead of assert: asserts are removed under -O.
        if not os.path.exists(args.test_model_path):
            parser.error(
                "The trained model does not exist. Please set a correct path.")
        lambda_rank_infer(args.test_model_path)
    else:
        logger.fatal(("A wrong value for parameter run type. "
                      "Available options are: train or infer."))