import os
import sys
import gzip
import functools
import paddle.v2 as paddle
import numpy as np
from metrics import ndcg

# RankNet is the classic pairwise learning-to-rank algorithm.
# http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf


def half_ranknet(name_prefix, input_dim):
    """
    Build one scoring branch of RankNet and return its output layer.

    The branch is a dense data layer, a 10-unit tanh layer and a 1-unit
    linear layer. The data layer is named ``name_prefix + "/data"``, while the
    weight parameters of the two fully connected layers carry fixed names
    ("hidden_w1" and "output"). Parameters with the same name are shared in
    the paddle framework, so branches built by this function (e.g. the left
    network and the right network) share those parameters.
    Shared parameters in detail:
    https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md

    :param name_prefix: prefix used to name this branch's data layer.
    :param input_dim: dimension of the dense input feature vector.
    :return: the size-1 linear fully connected layer of the branch.
    """
    # data layer, named per branch
    feature_type = paddle.data_type.dense_vector(input_dim)
    features = paddle.layer.data(name_prefix + "/data", feature_type)

    # hidden layer; the fixed parameter name is what lets branches share it
    hidden_weights = paddle.attr.Param(initial_std=0.01, name="hidden_w1")
    hidden = paddle.layer.fc(
        input=features,
        size=10,
        act=paddle.activation.Tanh(),
        param_attr=hidden_weights)

    # fully connected output layer, again with a fixed parameter name
    score_weights = paddle.attr.Param(initial_std=0.01, name="output")
    return paddle.layer.fc(
        input=hidden,
        size=1,
        act=paddle.activation.Linear(),
        param_attr=score_weights)


def ranknet(input_dim):
    """
    Assemble the pairwise RankNet and return its rank-cost layer.

    Two branches are built with half_ranknet ("left" and "right"), and their
    outputs are fed, together with a 1-dimensional dense "label" data layer,
    into a rank_cost layer named "cost".

    :param input_dim: dimension of each document's dense feature vector.
    :return: the rank_cost layer combining the left and right branch outputs.
    """
    # label layer
    pair_label = paddle.layer.data("label", paddle.data_type.dense_vector(1))

    # both branches go through half_ranknet, which reuses parameter names
    left_score = half_ranknet("left", input_dim)
    right_score = half_ranknet("right", input_dim)

    # AUC evaluator on the left branch output; its return value is not used
    # here (presumably the call itself attaches the metric -- confirm).
    paddle.evaluator.auc(input=left_score, label=pair_label)

    # rank cost layer
    return paddle.layer.rank_cost(
        name="cost", left=left_score, right=right_score, label=pair_label)


def train_ranknet(num_passes):
    """
    Train RankNet on the MQ2007 dataset, testing and checkpointing each pass.

    Training batches come from the shuffled MQ2007 train reader. At the end of
    every pass the model is run on the MQ2007 test reader, the metrics are
    printed, and the parameters are written to the relative path
    ``ranknet_params_<pass_id>.tar.gz``.

    :param num_passes: number of passes handed to the trainer.
    """
    train_reader = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.mq2007.train, buf_size=100),
        batch_size=100)
    test_reader = paddle.batch(paddle.dataset.mq2007.test, batch_size=100)

    # mq2007 feature_dim = 46, dense format
    feature_dim = 46
    cost = ranknet(feature_dim)
    parameters = paddle.parameters.create(cost)

    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        update_equation=paddle.optimizer.Adam(learning_rate=2e-4))

    # Define the input data order
    feeding = {"label": 0, "left/data": 1, "right/data": 2}

    # Define end batch and end pass event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            # Print the cost every 100 batches, a progress dot otherwise.
            if event.batch_id % 100 == 0:
                # Single-argument print(...) behaves the same under the
                # Python 2 print statement and the Python 3 print function.
                print("Pass %d Batch %d Cost %.9f" % (
                    event.pass_id, event.batch_id, event.cost))
            else:
                sys.stdout.write(".")
                sys.stdout.flush()
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_reader, feeding=feeding)
            print("\nTest with Pass %d, %s" % (event.pass_id, result.metrics))
            # One checkpoint per pass, keyed by the pass id.
            with gzip.open("ranknet_params_%d.tar.gz" % (event.pass_id),
                           "w") as f:
                parameters.to_tar(f)

    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)


def ranknet_infer(pass_id):
    """
    Load the parameters saved for ``pass_id`` and score the MQ2007 test set.

    The checkpoint ``ranknet_params_<pass_id>.tar.gz`` is read, every document
    yielded by the MQ2007 test reader in "plain_txt" format is scored with a
    single half_ranknet branch, and one line per document is printed with its
    query id and predicted score.

    :param pass_id: id of the training pass whose checkpoint should be loaded;
        it is used directly in the checkpoint file name.
    """
    print("Begin to Infer...")
    feature_dim = 46

    # we just need half_ranknet to predict a rank score, which can be used to
    # sort documents
    output = half_ranknet("left", feature_dim)

    # Load the checkpoint named after pass_id itself. The caller already
    # passes the id of the pass it wants, so subtracting one again here would
    # pick the previous pass (and a nonexistent file for pass 0). The context
    # manager closes the gzip handle once the parameters are read.
    with gzip.open("ranknet_params_%d.tar.gz" % pass_id) as f:
        parameters = paddle.parameters.Parameters.from_tar(f)

    # load data of same query and relevance documents, need ranknet to rank
    # these candidates
    infer_query_id = []
    infer_data = []

    # convert to mq2007 built-in data format
    # <query_id> <relevance_score> <feature_vector>
    plain_txt_test = functools.partial(
        paddle.dataset.mq2007.test, format="plain_txt")

    # The relevance score is not needed for prediction.
    for query_id, _relevance_score, feature_vector in plain_txt_test():
        infer_query_id.append(query_id)
        infer_data.append(feature_vector)

    # predict the score of every document in infer_data
    scores = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    for query_id, score in zip(infer_query_id, scores):
        # The doubled spaces reproduce the separators that the original
        # comma-separated Python 2 print statement emitted.
        print("query_id :  %s  ranknet rank document order :  %s" %
              (query_id, score))

if __name__ == "__main__":
    paddle.init(use_gpu=False, trainer_count=4)

    # Train for a fixed number of passes, then run inference, handing
    # ranknet_infer pass_num - 1 as its pass id.
    pass_num = 2
    train_ranknet(pass_num)
    ranknet_infer(pass_id=pass_num - 1)