# lambda_rank.py: LambdaRank listwise ranking example built on paddle.v2 and the mq2007 dataset.
import os, sys
import gzip
import paddle.v2 as paddle
import numpy as np
import functools


def lambda_rank(input_dim):
    """
    Build the LambdaRank listwise ranking network.

    Both the feature input and the label input are declared as sequences.
    https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf

    parameters :
      input_dim, dimension of the dense feature vector of one document

    format of the dense_vector_sequence:
    [[f, ...], [f, ...], ...], f is a float or an int number

    returns :
      (cost, output), the lambda_cost layer and the size-1 linear layer
      that feeds it
    """

    def dense(source, width, activation):
        # Each fully connected layer gets its own Param attribute object.
        return paddle.layer.fc(
            input=source,
            size=width,
            act=activation,
            param_attr=paddle.attr.Param(initial_std=0.01))

    # Input layers, created in the order label -> data.
    relevance = paddle.layer.data(
        "label", paddle.data_type.dense_vector_sequence(1))
    features = paddle.layer.data(
        "data", paddle.data_type.dense_vector_sequence(input_dim))

    # Two Tanh hidden layers followed by a linear layer of size 1.
    hidden_wide = dense(features, 128, paddle.activation.Tanh())
    hidden_narrow = dense(hidden_wide, 10, paddle.activation.Tanh())
    scores = dense(hidden_narrow, 1, paddle.activation.Linear())

    # NOTE(review): the evaluator object is never used afterwards;
    # presumably the call itself attaches the evaluator to the network.
    # Confirm before removing it.
    auc_evaluator = paddle.evaluator.auc(input=scores, label=relevance)

    # Listwise cost computed from the scores and the label sequence.
    ranking_cost = paddle.layer.lambda_cost(
        input=scores, score=relevance, NDCG_num=6, max_sort_size=-1)
    return ranking_cost, scores

def train_lambda_rank(num_passes):
    """
    Train the lambda_rank network on the mq2007 listwise data.

    After every pass the test reader is evaluated and the parameters are
    written to "lambda_rank_params_<pass_id>.tar.gz".

    parameters :
      num_passes, number of passes handed to trainer.train
    """
    # Position of each data layer's field inside one reader sample.
    feeding = {"label": 0, "data": 1}

    # listwise input sequence
    listwise_train = functools.partial(
        paddle.dataset.mq2007.train, format="listwise")
    listwise_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")
    shuffled_train = paddle.reader.shuffle(listwise_train, buf_size=100)
    train_reader = paddle.batch(shuffled_train, batch_size=32)
    test_reader = paddle.batch(listwise_test, batch_size=32)

    # mq2007 input_dim = 46, dense format
    cost, output = lambda_rank(46)
    parameters = paddle.parameters.create(cost)

    optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

    def on_event(event):
        # End of a batch: report the running cost.
        if isinstance(event, paddle.event.EndIteration):
            batch_report = "Pass %d Batch %d Cost %.9f" % (
                event.pass_id, event.batch_id, event.cost)
            print(batch_report)
        # End of a pass: evaluate on the test reader, then checkpoint.
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_reader, feeding=feeding)
            pass_report = "\nTest with Pass %d, %s" % (event.pass_id,
                                                       result.metrics)
            print(pass_report)
            checkpoint_name = "lambda_rank_params_%d.tar.gz" % (event.pass_id)
            with gzip.open(checkpoint_name, "w") as f:
                parameters.to_tar(f)

    trainer.train(
        reader=train_reader,
        event_handler=on_event,
        feeding=feeding,
        num_passes=num_passes)


def lambda_rank_infer(pass_id):
    """
    lambda_rank model inference interface

    Loads saved parameters, takes the first query list from the mq2007
    listwise test set and prints the predicted score of each document.

    parameters:
      pass_id : selects the parameter file; the file that is opened is
                "lambda_rank_params_<pass_id - 1>.tar.gz"
    """
    print("Begin to Infer...")
    input_dim = 46
    # lambda_rank returns (cost, output). Only the scoring layer is passed to
    # paddle.infer, so the cost layer and its label input stay out of the
    # inference network.
    _, output = lambda_rank(input_dim)

    # Close the archive once the parameters are loaded.
    with gzip.open("lambda_rank_params_%d.tar.gz" % (pass_id - 1)) as f:
        parameters = paddle.parameters.Parameters.from_tar(f)

    infer_data = []
    infer_data_num = 1
    fill_default_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")
    for _, querylist in fill_default_test():
        # One inference sample is a list of input fields; the document
        # sequence is the single field, fed to the "data" layer.
        infer_data.append([querylist])
        if len(infer_data) == infer_data_num:
            break

    # predict score of infer_data document. Re-sort the document base on predict score
    # in descending order. then we build the ranking documents
    predictions = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    for i, score in enumerate(predictions):
        print("%s %s" % (i, score))


if __name__ == "__main__":
    # Script entry point: initialise paddle without GPU and with one
    # trainer, train for two passes, then run inference with pass_id=1.
    paddle.init(trainer_count=1, use_gpu=False)
    train_lambda_rank(num_passes=2)
    lambda_rank_infer(pass_id=1)