first PR of query-relationship

b5d37692 · zhangcr · bd6f3b90 · b5d37692 · b5d37692 · b5d37692
8 changed file
--- a/query_relationship/.gitignore
+++ b/query_relationship/.gitignore
+data/MQ2007
+data/train.list
+data/vali.list
+data/pred.list
+*.log
+output/
+*.pyc
+core
--- a/query_relationship/data/getdata.sh
+++ b/query_relationship/data/getdata.sh
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+DIR="$(cd "$(dirname "$0")" ; pwd -P )"
+cd $DIR
+
+#Download MQ2007 dataset
+echo "Downloading query-docs data..."
+wget http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar
+
+#Extract package
+echo "Unzipping..."
+unrar x MQ2007.rar
+
+#Remove compressed package
+rm MQ2007.rar
+
+echo "data/MQ2007/Fold1/train.txt" > train.list
+echo "data/MQ2007/Fold1/vali.txt" > test.list
+echo "data/MQ2007/Fold1/test.txt" > pred.list
+
+echo "Done."
--- a/query_relationship/data/test.list
+++ b/query_relationship/data/test.list
+data/MQ2007/Fold1/vali.txt
--- a/query_relationship/dataprovider.py
+++ b/query_relationship/dataprovider.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer.PyDataProvider2 import *
+
+
+#Define a data provider for "query relationship"
+@provider(
+    input_types={
+        'features1': dense_vector(46),
+        'features2': dense_vector(46),
+        'label': dense_vector(1)
+    },
+    should_shuffle=False,
+    cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file_name):
+    with open(file_name) as f:
+        pre_qid = -1
+        feats1 = []
+        feats2 = []
+        l1 = 0
+        l2 = 0
+        for line in f:
+            line = line.split('#')[0]
+            if len(line.split()) < 48:
+                continue
+            qid = int(line.split()[1].split(':')[1])
+            if pre_qid != qid:
+                feats1 = []
+                for term in line.split()[2:48]:
+                    feats1.append(float(term.split(':')[1]))
+                l1 = int(line.split()[0])
+                pre_qid = qid
+                feats2 = feats1
+                yield feats1, feats2, [0.5]
+            else:
+                feats1 = feats2
+                feats2 = []
+                l1 = l2
+                for term in line.split()[2:48]:
+                    feats2.append(float(term.split(':')[1]))
+                l2 = int(line.split()[0])
+                p12 = 0.5
+                if l1 > l2:
+                    p12 = 1
+                if l1 < l2:
+                    p12 = 0
+                yield feats1, feats2, [p12]
+
+
+@provider(input_types={'features': dense_vector(46)})
+def process_predict(settings, file_name):
+    with open(file_name) as f:
+        for line in f:
+            feats = []
+            line = line.split('#')[0]
+            for term in line.split()[2:48]:
+                feats.append(float(term.split(':')[1]))
+            yield feats
--- a/query_relationship/evaluate.py
+++ b/query_relationship/evaluate.py
+#!/usr/bin/python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import re
+import math
+
+
+def get_best_pass(log_filename):
+    with open(log_filename, 'r') as f:
+        text = f.read()
+        pattern = re.compile('Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)',
+                             re.S)
+        results = re.findall(pattern, text)
+        sorted_results = sorted(results, key=lambda result: -float(result[0]))
+        return sorted_results[0]
+
+
+log_filename = sys.argv[1]
+log = get_best_pass(log_filename)
+print 'Best pass is %s, rank-cost is %s' % (log[1], log[0])
+
+evaluate_pass = "output/pass-%s" % log[1]
+print "evaluating from pass %s" % evaluate_pass
--- a/query_relationship/predict.sh
+++ b/query_relationship/predict.sh
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+# pass choice 
+model="output/pass-00001"
+paddle train \
+    --config=trainer_config.py \
+    --use_gpu=false \
+    --job=test \
+    --init_model_path=$model \
+    --config_args=is_predict=1 \
+    --predict_output_dir=./
--- a/query_relationship/train.sh
+++ b/query_relationship/train.sh
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+paddle train \
+    --config=trainer_config.py \
+    --save_dir=./output \
+    --use_gpu=false \
+    --trainer_count=1 \
+    --test_all_data_in_one_period=true \
+    --num_passes=10 \
+    --log_period=500  2>&1 | tee 'train.log'
--- a/query_relationship/trainer_config.py
+++ b/query_relationship/trainer_config.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+is_predict = get_config_arg('is_predict', bool, False)
+trn = 'data/train.list' if not is_predict else None
+tst = 'data/test.list' if not is_predict else 'data/pred.list'
+process = 'process' if not is_predict else 'process_predict'
+
+# 1. read data
+define_py_data_sources2(
+    train_list=trn, test_list=tst, module='dataprovider', obj=process)
+
+# 2. learning algorithm
+batch_size = 5 if not is_predict else 1
+settings(
+    batch_size=batch_size,
+    learning_rate=1e-3,
+    learning_method=RMSPropOptimizer())
+
+# 3. Network configuration
+feature_num = 46
+hid_num = 6
+if not is_predict:
+    x1 = data_layer(name='features1', size=feature_num)
+    x2 = data_layer(name='features2', size=feature_num)
+    y = data_layer(name='label', size=1)
+    hidden1 = fc_layer(
+        name='hidden', input=x1, size=hid_num, act=LinearActivation())
+    hidden2 = LayerOutput('hidden', LayerType.FC_LAYER, x2, size=feature_num)
+    y1 = fc_layer(name='output', input=hidden1, size=1, act=SigmoidActivation())
+    y2 = LayerOutput('output', LayerType.FC_LAYER, hidden2, size=1)
+    outputs(rank_cost(left=y1, right=y2, label=y))
+else:
+    x = data_layer(name='features', size=feature_num)
+    hidden = fc_layer(
+        name='hidden', input=x, size=hid_num, act=LinearActivation())
+    y_pred = fc_layer(
+        name='output', input=hidden, size=1, act=SigmoidActivation())
+    outputs(y_pred)