ranknet.py 5.0 KB
Newer Older
D
dzhwinter 已提交
1 2
import os
import sys
D
dong zhihong 已提交
3
import gzip
4
import functools
D
dong zhihong 已提交
5 6
import paddle.v2 as paddle
import numpy as np
7
from metrics import ndcg
D
dongzhihong 已提交
8
import argparse
D
dong zhihong 已提交
9 10 11 12

# RankNet is the classic pairwise learning-to-rank algorithm:
# http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf

13

D
dong zhihong 已提交
14
def half_ranknet(name_prefix, input_dim):
    """
    Build one scoring branch of RankNet and return its output layer.

    The branch is: dense data layer -> 10-unit tanh hidden layer ->
    1-unit linear output layer.

    The weight parameters are given the fixed names "hidden_w1" and
    "output". Parameters with the same name are shared by the paddle
    framework, so every branch built by this function (e.g. the left and
    right networks) uses the same weights. Details:
    https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md

    :param name_prefix: prefix of the data layer name; the layer is
        called "<name_prefix>/data".
    :param input_dim: dimension of the dense input feature vector.
    :return: the output layer producing the score.
    """
    # Data layer; its name is what the feeding dictionary refers to.
    features = paddle.layer.data(name_prefix + "/data",
                                 paddle.data_type.dense_vector(input_dim))

    # Hidden layer. The explicit parameter name is what enables sharing.
    hidden_weights = paddle.attr.Param(initial_std=0.01, name="hidden_w1")
    hidden = paddle.layer.fc(
        input=features,
        size=10,
        act=paddle.activation.Tanh(),
        param_attr=hidden_weights)

    # Fully connected output layer with a single linear unit.
    score_weights = paddle.attr.Param(initial_std=0.01, name="output")
    return paddle.layer.fc(
        input=hidden,
        size=1,
        act=paddle.activation.Linear(),
        param_attr=score_weights)
D
dong zhihong 已提交
38 39 40


def ranknet(input_dim):
    """
    Build the pairwise RankNet cost on top of two half_ranknet branches.

    :param input_dim: dimension of each document's dense feature vector.
    :return: the rank_cost layer (named "cost") comparing the left and
        right branch outputs against the "label" data layer.
    """
    # Label layer: a dense vector of size 1.
    pair_label = paddle.layer.data("label", paddle.data_type.dense_vector(1))

    # Both branches go through half_ranknet, which names its weight
    # parameters explicitly so that they are reused across branches.
    left_score = half_ranknet("left", input_dim)
    right_score = half_ranknet("right", input_dim)

    # NOTE(review): the return value is not used in this function;
    # presumably the call attaches the AUC evaluator to the network as a
    # side effect - confirm before removing it.
    paddle.evaluator.auc(input=left_score, label=pair_label)

    # Rank cost layer.
    return paddle.layer.rank_cost(
        name="cost", left=left_score, right=right_score, label=pair_label)
D
dong zhihong 已提交
53 54 55


def train_ranknet(num_passes):
    """
    Train RankNet on the mq2007 dataset.

    Progress is printed while training. At the end of every pass the test
    reader is evaluated and the parameters are written to
    "ranknet_params_<pass_id>.tar.gz" in the current directory.

    :param num_passes: number of passes, forwarded to trainer.train.
    """
    shuffled_train = paddle.reader.shuffle(
        paddle.dataset.mq2007.train, buf_size=100)
    train_reader = paddle.batch(shuffled_train, batch_size=100)
    test_reader = paddle.batch(paddle.dataset.mq2007.test, batch_size=100)

    # mq2007 feature_dim = 46, dense format
    feature_dim = 46
    cost = ranknet(feature_dim)
    parameters = paddle.parameters.create(cost)

    optimizer = paddle.optimizer.Adam(learning_rate=2e-4)
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

    # Position of each data layer's value inside a reader sample.
    feeding = {"label": 0, "left/data": 1, "right/data": 2}

    def on_event(event):
        """Report progress per batch; test and checkpoint per pass."""
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 != 0:
                # Cheap progress marker between the full report lines.
                sys.stdout.write(".")
                sys.stdout.flush()
            else:
                print("Pass %d Batch %d Cost %.9f" % (
                    event.pass_id, event.batch_id, event.cost))
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_reader, feeding=feeding)
            print("\nTest with Pass %d, %s" % (event.pass_id, result.metrics))
            archive_name = "ranknet_params_%d.tar.gz" % (event.pass_id)
            with gzip.open(archive_name, "w") as f:
                trainer.save_parameter_to_tar(f)

    trainer.train(
        reader=train_reader,
        event_handler=on_event,
        feeding=feeding,
        num_passes=num_passes)

D
dong zhihong 已提交
97 98

def ranknet_infer(pass_id):
    """
    Load a trained model and predict a rank score for every test document.

    The parameters are read from "ranknet_params_<pass_id>.tar.gz" in the
    current directory, i.e. the file train_ranknet writes at the end of
    pass `pass_id`. The scores are printed to stdout, first as a whole
    and then one line per document together with its query id.

    :param pass_id: index of the training pass whose parameters to load.
    """
    print("Begin to Infer...")
    feature_dim = 46

    # Only one half_ranknet branch is needed to predict a rank score,
    # which can then be used to sort documents.
    output = half_ranknet("infer", feature_dim)

    # Close the archive as soon as the parameters are loaded; the
    # original left the gzip file object open.
    with gzip.open("ranknet_params_%d.tar.gz" % (pass_id)) as f:
        parameters = paddle.parameters.Parameters.from_tar(f)

    # Load the documents of each query; RankNet scores these candidates.
    infer_query_id = []
    infer_data = []

    # Read the test set in the mq2007 built-in plain text format:
    # <query_id> <relevance_score> <feature_vector>
    plain_txt_test = functools.partial(
        paddle.dataset.mq2007.test, format="plain_txt")

    for query_id, _, feature_vector in plain_txt_test():
        infer_query_id.append(query_id)
        # Each sample is a one-element list: the single data layer input.
        infer_data.append([feature_vector])

    # Predict the score of every document in infer_data. Sorting the
    # documents of a query by score in descending order yields the
    # ranking.
    scores = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    print(scores)
    for query_id, score in zip(infer_query_id, scores):
        # The doubled spaces reproduce the output of the former
        # comma-separated Python 2 print statement.
        print("query_id :  %s  ranknet rank document order :  %s" %
              (query_id, score))
134

D
dong zhihong 已提交
135 136

if __name__ == '__main__':
    # Command-line entry point: train RankNet, or run inference with the
    # parameters saved by the last training pass.
    parser = argparse.ArgumentParser(description='Ranknet demo')
    parser.add_argument("--run_type", type=str, help="run type is train|infer")
    parser.add_argument(
        "--num_passes",
        type=int,
        help="num of passes in train| infer pass number of model")
    args = parser.parse_args()

    # Fail fast with a usage message instead of silently doing nothing
    # when the run type is missing or unknown.
    if args.run_type not in ("train", "infer"):
        parser.error("--run_type must be 'train' or 'infer'")

    paddle.init(use_gpu=False, trainer_count=4)
    if args.run_type == "train":
        train_ranknet(args.num_passes)
    elif args.run_type == "infer":
        if args.num_passes is None:
            parser.error("--num_passes is required when --run_type is infer")
        # Pass ids are zero-based, so the last pass of a num_passes run is
        # num_passes - 1. The original read the undefined `args.pass_num`.
        ranknet_infer(pass_id=args.num_passes - 1)