ranknet.py 4.9 KB
Newer Older
D
dzhwinter 已提交
1 2
import os
import sys
D
dong zhihong 已提交
3
import gzip
4
import functools
D
dong zhihong 已提交
5 6
import paddle.v2 as paddle
import numpy as np
7
from metrics import ndcg
D
dongzhihong 已提交
8
import argparse
D
dong zhihong 已提交
9 10 11 12

# ranknet is the classic pairwise learning to rank algorithm
# http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf

13

D
dong zhihong 已提交
14
def half_ranknet(name_prefix, input_dim):
    """Build one scoring branch of RankNet and return its output layer.

    Parameters that carry the same name are shared by the paddle
    framework, so every branch built here (e.g. the left and the right
    network) reuses the "hidden_w1" and "output" parameters.
    Shared parameters in detail:
    https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md

    name_prefix: prefix of the data layer name; the data layer is named
        name_prefix + "/data".
    input_dim: dimension of the dense input feature vector.
    """
    # Data layer: one dense feature vector per sample.
    features = paddle.layer.data(name_prefix + "/data",
                                 paddle.data_type.dense_vector(input_dim))

    # Hidden layer: 10 tanh units; the fixed parameter name enables sharing.
    hidden_param = paddle.attr.Param(initial_std=0.01, name="hidden_w1")
    hidden = paddle.layer.fc(
        input=features,
        size=10,
        act=paddle.activation.Tanh(),
        param_attr=hidden_param)

    # Output layer: a single linear unit producing the rank score.
    score_param = paddle.attr.Param(initial_std=0.01, name="output")
    return paddle.layer.fc(
        input=hidden,
        size=1,
        act=paddle.activation.Linear(),
        param_attr=score_param)
D
dong zhihong 已提交
38 39 40


def ranknet(input_dim):
    """Assemble the pairwise RankNet topology and return its cost layer.

    input_dim: dimension of the dense feature vector of each document.
    """
    # Label layer, fed as a one-dimensional dense vector.
    pair_label = paddle.layer.data("label", paddle.data_type.dense_vector(1))

    # Both branches are built by half_ranknet, which names its parameters
    # identically so that the two branches share them.
    left_score = half_ranknet("left", input_dim)
    right_score = half_ranknet("right", input_dim)

    # AUC evaluator on the left branch; its return value is not used here.
    paddle.evaluator.auc(input=left_score, label=pair_label)

    # Rank cost over the (left, right) score pair.
    return paddle.layer.rank_cost(
        name="cost", left=left_score, right=right_score, label=pair_label)
D
dong zhihong 已提交
53 54 55


def train_ranknet(num_passes):
    """Train RankNet on the mq2007 dataset for `num_passes` passes.

    At the end of every pass the model is evaluated on the test reader and
    the parameters are written to "ranknet_params_<pass_id>.tar.gz".
    """
    shuffled_train = paddle.reader.shuffle(
        paddle.dataset.mq2007.train, buf_size=100)
    train_reader = paddle.batch(shuffled_train, batch_size=100)
    test_reader = paddle.batch(paddle.dataset.mq2007.test, batch_size=100)

    # mq2007 feature_dim = 46, dense format
    feature_dim = 46
    cost = ranknet(feature_dim)
    parameters = paddle.parameters.create(cost)

    adam = paddle.optimizer.Adam(learning_rate=2e-4)
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=adam)

    # Define the input data order
    feeding = {"label": 0, "left/data": 1, "right/data": 2}

    def report_batch(event):
        # A full status line on every 100th batch, a progress dot otherwise.
        if event.batch_id % 100 != 0:
            sys.stdout.write(".")
            sys.stdout.flush()
            return
        print("Pass %d Batch %d Cost %.9f" % (
            event.pass_id, event.batch_id, event.cost))

    def evaluate_and_save(event):
        # Test the current model, then snapshot the parameters of this pass.
        result = trainer.test(reader=test_reader, feeding=feeding)
        print("\nTest with Pass %d, %s" % (event.pass_id, result.metrics))
        snapshot_name = "ranknet_params_%d.tar.gz" % (event.pass_id)
        with gzip.open(snapshot_name, "w") as f:
            parameters.to_tar(f)

    # End-of-batch and end-of-pass event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            report_batch(event)
        if isinstance(event, paddle.event.EndPass):
            evaluate_and_save(event)

    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)

D
dong zhihong 已提交
97 98

def ranknet_infer(pass_id):
    """Load a trained model and print a rank score for each test document.

    The parameters are read from "ranknet_params_<pass_id>.tar.gz", the
    file name used by train_ranknet at the end of pass `pass_id`. The
    mq2007 test set is read in "plain_txt" format and every document is
    scored; the scores are printed in reader order (no re-sorting is done
    here).

    pass_id: id of the training pass whose saved parameters are loaded.
    """
    print("Begin to Infer...")
    feature_dim = 46

    # Only half_ranknet is needed to predict a rank score, which can then
    # be used to sort documents.
    output = half_ranknet("infer", feature_dim)

    # Use a context manager so the archive is closed once the parameters
    # have been loaded (the handle used to be leaked).
    with gzip.open("ranknet_params_%d.tar.gz" % (pass_id)) as f:
        parameters = paddle.parameters.Parameters.from_tar(f)

    # mq2007 built-in plain text format yields
    # <query_id> <relevance_score> <feature_vector>
    plain_txt_test = functools.partial(
        paddle.dataset.mq2007.test, format="plain_txt")

    infer_query_id = []
    infer_data = []
    # The relevance score is ground truth and is not needed for inference.
    for query_id, _, feature_vector in plain_txt_test():
        infer_query_id.append(query_id)
        # Each inference sample is a one-element list holding the features.
        infer_data.append([feature_vector])

    # Predict a score for every document in infer_data.
    scores = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    print(scores)
    for query_id, score in zip(infer_query_id, scores):
        # Same text as the former multi-item print statement, which put a
        # single space between items.
        print("query_id :  %s  ranknet rank document order :  %s" %
              (query_id, score))
131

D
dong zhihong 已提交
132 133

if __name__ == '__main__':
    # Command line entry point: train the model, or run inference with the
    # parameters saved by the last training pass.
    parser = argparse.ArgumentParser(description='Ranknet demo')
    parser.add_argument("--run_type", type=str, help="run type is train|infer")
    parser.add_argument(
        "--num_passes",
        type=int,
        help="num of passes in train| infer pass number of model")
    args = parser.parse_args()

    # Reject unusable invocations up front instead of silently doing
    # nothing or failing later with an obscure error.
    if args.run_type not in ("train", "infer"):
        parser.error("--run_type must be train or infer")
    if args.num_passes is None or args.num_passes < 1:
        parser.error("--num_passes must be a positive integer")

    paddle.init(use_gpu=False, trainer_count=4)
    if args.run_type == "train":
        train_ranknet(args.num_passes)
    else:
        # Pass ids are zero-based, so the last saved model of a run with
        # num_passes passes has id num_passes - 1. The parsed option is
        # `num_passes`; `args.pass_num` never existed.
        ranknet_infer(pass_id=args.num_passes - 1)