提交 b5d37692 编写于 作者: Z zhangcr

first PR of query-relationship

上级 bd6f3b90
data/MQ2007
data/train.list
data/vali.list
data/pred.list
*.log
output/
*.pyc
core
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
DIR="$(cd "$(dirname "$0")" ; pwd -P )"
cd $DIR
#Download MQ2007 dataset
echo "Downloading query-docs data..."
wget http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar
#Extract package
echo "Unzipping..."
unrar x MQ2007.rar
#Remove compressed package
rm MQ2007.rar
echo "data/MQ2007/Fold1/train.txt" > train.list
echo "data/MQ2007/Fold1/vali.txt" > test.list
echo "data/MQ2007/Fold1/test.txt" > pred.list
echo "Done."
data/MQ2007/Fold1/vali.txt
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import *
#Define a data provider for "query relationship"
@provider(
input_types={
'features1': dense_vector(46),
'features2': dense_vector(46),
'label': dense_vector(1)
},
should_shuffle=False,
cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_name):
with open(file_name) as f:
pre_qid = -1
feats1 = []
feats2 = []
l1 = 0
l2 = 0
for line in f:
line = line.split('#')[0]
if len(line.split()) < 48:
continue
qid = int(line.split()[1].split(':')[1])
if pre_qid != qid:
feats1 = []
for term in line.split()[2:48]:
feats1.append(float(term.split(':')[1]))
l1 = int(line.split()[0])
pre_qid = qid
feats2 = feats1
yield feats1, feats2, [0.5]
else:
feats1 = feats2
feats2 = []
l1 = l2
for term in line.split()[2:48]:
feats2.append(float(term.split(':')[1]))
l2 = int(line.split()[0])
p12 = 0.5
if l1 > l2:
p12 = 1
if l1 < l2:
p12 = 0
yield feats1, feats2, [p12]
@provider(input_types={'features': dense_vector(46)})
def process_predict(settings, file_name):
with open(file_name) as f:
for line in f:
feats = []
line = line.split('#')[0]
for term in line.split()[2:48]:
feats.append(float(term.split(':')[1]))
yield feats
#!/usr/bin/python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import re
import math
def get_best_pass(log_filename):
with open(log_filename, 'r') as f:
text = f.read()
pattern = re.compile('Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)',
re.S)
results = re.findall(pattern, text)
sorted_results = sorted(results, key=lambda result: -float(result[0]))
return sorted_results[0]
log_filename = sys.argv[1]
log = get_best_pass(log_filename)
print 'Best pass is %s, rank-cost is %s' % (log[1], log[0])
evaluate_pass = "output/pass-%s" % log[1]
print "evaluating from pass %s" % evaluate_pass
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# pass choice
model="output/pass-00001"
paddle train \
--config=trainer_config.py \
--use_gpu=false \
--job=test \
--init_model_path=$model \
--config_args=is_predict=1 \
--predict_output_dir=./
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
paddle train \
--config=trainer_config.py \
--save_dir=./output \
--use_gpu=false \
--trainer_count=1 \
--test_all_data_in_one_period=true \
--num_passes=10 \
--log_period=500 2>&1 | tee 'train.log'
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
# 1. read data
define_py_data_sources2(
train_list=trn, test_list=tst, module='dataprovider', obj=process)
# 2. learning algorithm
batch_size = 5 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=1e-3,
learning_method=RMSPropOptimizer())
# 3. Network configuration
feature_num = 46
hid_num = 6
if not is_predict:
x1 = data_layer(name='features1', size=feature_num)
x2 = data_layer(name='features2', size=feature_num)
y = data_layer(name='label', size=1)
hidden1 = fc_layer(
name='hidden', input=x1, size=hid_num, act=LinearActivation())
hidden2 = LayerOutput('hidden', LayerType.FC_LAYER, x2, size=feature_num)
y1 = fc_layer(name='output', input=hidden1, size=1, act=SigmoidActivation())
y2 = LayerOutput('output', LayerType.FC_LAYER, hidden2, size=1)
outputs(rank_cost(left=y1, right=y2, label=y))
else:
x = data_layer(name='features', size=feature_num)
hidden = fc_layer(
name='hidden', input=x, size=hid_num, act=LinearActivation())
y_pred = fc_layer(
name='output', input=hidden, size=1, act=SigmoidActivation())
outputs(y_pred)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册