PaddlePaddle / models
Commit eb88169e: fix LTR example.
Authored on Nov 23, 2017 by caoying03
Parent: ede5a045

Showing 6 changed files with 182 additions and 112 deletions (+182, -112)
Changed files:
- ltr/lambda_rank.py (+76, -38)
- ltr/metrics.py (+1, -1)
- ltr/ranknet.py (+84, -30)
- ltr/run_lambdarank.sh (+0, -11)
- ltr/run_ranknet.sh (+0, -11)
- text_classification/utils.py (+21, -21)
ltr/lambda_rank.py
@@ -3,10 +3,14 @@ import sys
 import gzip
 import functools
+import argparse
+import logging

 import numpy as np
 import paddle.v2 as paddle

+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)


 def lambda_rank(input_dim, is_infer):
     """
@@ -26,43 +30,39 @@ def lambda_rank(input_dim, is_infer):
     data = paddle.layer.data("data",
                              paddle.data_type.dense_vector_sequence(input_dim))

-    # Define hidden layer.
+    # Define the hidden layer.
     hd1 = paddle.layer.fc(
         input=data,
         size=128,
         act=paddle.activation.Tanh(),
         param_attr=paddle.attr.Param(initial_std=0.01))
     hd2 = paddle.layer.fc(
         input=hd1,
         size=10,
         act=paddle.activation.Tanh(),
         param_attr=paddle.attr.Param(initial_std=0.01))
     output = paddle.layer.fc(
         input=hd2,
         size=1,
         act=paddle.activation.Linear(),
         param_attr=paddle.attr.Param(initial_std=0.01))

     if not is_infer:
         # Define evaluator.
         evaluator = paddle.evaluator.auc(input=output, label=label)
-        # Define cost layer.
+        # Define the cost layer.
         cost = paddle.layer.lambda_cost(
             input=output, score=label, NDCG_num=6, max_sort_size=-1)
         return cost, output
     return output


-def train_lambda_rank(num_passes):
-    # The input for LambdaRank is a sequence.
+def lambda_rank_train(num_passes, model_save_dir):
+    # The input for LambdaRank must be a sequence.
     fill_default_train = functools.partial(
         paddle.dataset.mq2007.train, format="listwise")
     fill_default_test = functools.partial(
         paddle.dataset.mq2007.test, format="listwise")
     train_reader = paddle.batch(
         paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32)
     test_reader = paddle.batch(fill_default_test, batch_size=32)

     # Training dataset: mq2007, input_dim = 46, dense format.
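For orientation (not part of the commit): after this hunk, lambda_rank keeps a single network definition and branches only on is_infer, returning (cost, output) for training and just the score layer for inference. A minimal usage sketch, assuming PaddlePaddle v2 is installed, the script is run from the ltr/ directory, and the label layer defined earlier in the function (outside this hunk) is unchanged:

# Hypothetical usage sketch, not taken from the commit.
import paddle.v2 as paddle
from lambda_rank import lambda_rank  # assumes ltr/ is the working directory

paddle.init(use_gpu=False, trainer_count=1)

# Training mode: returns the lambda_cost layer plus the score output,
# so a trainer can optimize the listwise cost directly.
cost, score = lambda_rank(input_dim=46, is_infer=False)

# Inference mode: returns only the score layer; no cost is built.
score_only = lambda_rank(input_dim=46, is_infer=True)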
@@ -78,13 +78,15 @@ def train_lambda_rank(num_passes):
     # Define end batch and end pass event handler.
     def event_handler(event):
         if isinstance(event, paddle.event.EndIteration):
-            print "Pass %d Batch %d Cost %.9f" % (event.pass_id, event.batch_id, event.cost)
+            logger.info("Pass %d Batch %d Cost %.9f" %
+                        (event.pass_id, event.batch_id, event.cost))

         if isinstance(event, paddle.event.EndPass):
             result = trainer.test(reader=test_reader, feeding=feeding)
-            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
-            with gzip.open("lambda_rank_params_%d.tar.gz" % (event.pass_id), "w") as f:
+            logger.info("\nTest with Pass %d, %s" % (event.pass_id, result.metrics))
+            with gzip.open(
+                    os.path.join(model_save_dir,
+                                 "lambda_rank_params_%d.tar.gz" % (event.pass_id)),
+                    "w") as f:
                 trainer.save_parameter_to_tar(f)

     feeding = {"label": 0, "data": 1}
@@ -95,17 +97,17 @@ def train_lambda_rank(num_passes):
         num_passes=num_passes)


-def lambda_rank_infer(pass_id):
+def lambda_rank_infer(test_model_path):
     """LambdaRank model inference interface.

     Parameters:
-        pass_id : inference model in pass_id
+        test_model_path : The path of the trained model.
     """
-    print "Begin to Infer..."
+    logger.info("Begin to Infer...")
     input_dim = 46
     output = lambda_rank(input_dim, is_infer=True)
     parameters = paddle.parameters.Parameters.from_tar(
-        gzip.open("lambda_rank_params_%d.tar.gz" % (pass_id - 1)))
+        gzip.open(test_model_path))

     infer_query_id = None
     infer_data = []
@@ -128,15 +130,51 @@ def lambda_rank_infer(pass_id):
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='LambdaRank demo')
-    parser.add_argument("--run_type", type=str, help="run type is train|infer")
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle LambdaRank example.")
+    parser.add_argument(
+        "--run_type",
+        type=str,
+        help=("A flag indicating to run the training or the inferring task. "
+              "Available options are: train or infer."),
+        default="train")
     parser.add_argument(
         "--num_passes",
         type=int,
-        help="The Num of passes in train| infer pass number of model.")
+        help="The number of passes to train the model.",
+        default=10)
+    parser.add_argument(
+        "--use_gpu",
+        type=bool,
+        help="A flag indicating whether to use the GPU device in training.",
+        default=False)
+    parser.add_argument(
+        "--trainer_count",
+        type=int,
+        help="The thread number used in training.",
+        default=1)
+    parser.add_argument(
+        "--model_save_dir",
+        type=str,
+        required=False,
+        help=("The path to save the trained models."),
+        default="models")
+    parser.add_argument(
+        "--test_model_path",
+        type=str,
+        required=False,
+        help=("This parameter works only in inferring task to "
+              "specify path of a trained model."),
+        default="")
     args = parser.parse_args()

-    paddle.init(use_gpu=False, trainer_count=1)
+    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)

     if args.run_type == "train":
-        train_lambda_rank(args.num_passes)
+        lambda_rank_train(args.num_passes, args.model_save_dir)
     elif args.run_type == "infer":
-        lambda_rank_infer(pass_id=args.num_passes - 1)
+        assert os.path.exists(args.test_model_path), (
+            "The trained model does not exit. Please set a correct path.")
+        lambda_rank_infer(args.test_model_path)
+    else:
+        logger.fatal(("A wrong value for parameter run type. "
+                      "Available options are: train or infer."))
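With run_lambdarank.sh removed later in this commit, training and inference can be driven either through the new command-line flags above or programmatically. A hedged sketch of the programmatic path, using the function names and the parameter-file naming pattern from this diff; the pass index 9 and the directory creation are illustrative assumptions:

# Illustrative only; mirrors the new entry points shown in the diff above.
import os
import paddle.v2 as paddle
from lambda_rank import lambda_rank_train, lambda_rank_infer

paddle.init(use_gpu=False, trainer_count=1)

model_save_dir = "models"
if not os.path.exists(model_save_dir):
    os.mkdir(model_save_dir)

# Train for 10 passes; parameters are written to
# models/lambda_rank_params_<pass_id>.tar.gz after every pass.
lambda_rank_train(num_passes=10, model_save_dir=model_save_dir)

# Run inference with one of the saved parameter archives (example file name).
lambda_rank_infer(os.path.join(model_save_dir, "lambda_rank_params_9.tar.gz"))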
ltr/metrics.py
@@ -10,7 +10,7 @@ def ndcg(score_list):
         score_list: np.array, shape=(sample_num,1)
         e.g. predict rank score list :
         >>> scores = [3, 2, 3, 0, 1, 2]
         >>> ndcg_score = ndcg(scores)
     """
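For context on the docstring touched above: ndcg scores a predicted list of relevance labels against the ideal ordering. The sketch below uses the standard graded-gain, log2-discount formulation as a reference; the exact conventions inside ltr/metrics.py are not shown in this diff, so treat it as an assumption rather than that file's implementation:

# Reference NDCG sketch (standard definition); not the code from ltr/metrics.py.
import numpy as np

def dcg(relevances):
    relevances = np.asarray(relevances, dtype=float)
    positions = np.arange(1, len(relevances) + 1)
    # Gain (2^rel - 1) discounted by log2(position + 1).
    return np.sum((2 ** relevances - 1) / np.log2(positions + 1))

def reference_ndcg(relevances):
    ideal = dcg(sorted(relevances, reverse=True))
    return dcg(relevances) / ideal if ideal > 0 else 0.0

print(reference_ndcg([3, 2, 3, 0, 1, 2]))  # scores from the docstring example above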
ltr/ranknet.py
@@ -2,15 +2,23 @@ import os
 import sys
 import gzip
 import functools
-import paddle.v2 as paddle
-import numpy as np
-from metrics import ndcg
+import argparse
+import logging

+import numpy as np
+import paddle.v2 as paddle

+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)

 # ranknet is the classic pairwise learning to rank algorithm
 # http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf


+def score_diff(right_score, left_score):
+    return np.average(np.abs(right_score - left_score))


 def half_ranknet(name_prefix, input_dim):
     """
     parameter in same name will be shared in paddle framework,
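The score_diff helper added above is simply the mean absolute gap between the right and left towers' scores; a later hunk logs it during training as "average absolute diff scores". A tiny self-contained check with made-up values:

# Illustrative values; score_diff mirrors the helper added in the hunk above.
import numpy as np

def score_diff(right_score, left_score):
    return np.average(np.abs(right_score - left_score))

left = np.array([[0.8], [0.1], [0.4]])
right = np.array([[0.3], [0.6], [0.4]])
print(score_diff(right, left))  # 0.333...: mean absolute per-pair difference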
@@ -19,18 +27,21 @@ def half_ranknet(name_prefix, input_dim):
     https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md
     """
     # data layer
-    data = paddle.layer.data(name_prefix + "/data",
+    data = paddle.layer.data(name_prefix + "_data",
                              paddle.data_type.dense_vector(input_dim))

     # hidden layer
     hd1 = paddle.layer.fc(
         input=data,
         name=name_prefix + "_hidden",
         size=10,
         act=paddle.activation.Tanh(),
         param_attr=paddle.attr.Param(initial_std=0.01, name="hidden_w1"))

-    # fully connect layer/ output layer
+    # fully connected layer and output layer
     output = paddle.layer.fc(
         input=hd1,
         name=name_prefix + "_score",
         size=1,
         act=paddle.activation.Linear(),
         param_attr=paddle.attr.Param(initial_std=0.01, name="output"))
@@ -45,14 +56,13 @@ def ranknet(input_dim):
     output_left = half_ranknet("left", input_dim)
     output_right = half_ranknet("right", input_dim)

     evaluator = paddle.evaluator.auc(input=output_left, label=label)
     # rankcost layer
     cost = paddle.layer.rank_cost(
         name="cost", left=output_left, right=output_right, label=label)
     return cost


-def train_ranknet(num_passes):
+def ranknet_train(num_passes, model_save_dir):
     train_reader = paddle.batch(
         paddle.reader.shuffle(paddle.dataset.mq2007.train, buf_size=100),
         batch_size=100)
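paddle.layer.rank_cost in this hunk ties the two weight-sharing towers together with the classic pairwise RankNet loss cited in the module comment. The numpy sketch below is a conceptual illustration of that loss (score difference fed through a sigmoid, then cross-entropy against the pair label), not the Paddle layer's actual implementation:

# Conceptual numpy sketch of the classic pairwise RankNet cost; the real
# computation lives inside paddle.layer.rank_cost.
import numpy as np

def ranknet_pair_cost(left_score, right_score, label):
    """label is 1.0 if the left document should rank higher, else 0.0."""
    p = 1.0 / (1.0 + np.exp(-(left_score - right_score)))  # P(left > right)
    return -(label * np.log(p) + (1.0 - label) * np.log(1.0 - p))

# A correctly ordered pair yields a small cost, a mis-ordered one a large cost.
print(ranknet_pair_cost(2.0, 0.5, 1.0))  # ~0.20
print(ranknet_pair_cost(0.5, 2.0, 1.0))  # ~1.70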
@@ -70,22 +80,28 @@ def train_ranknet(num_passes):
         update_equation=paddle.optimizer.Adam(learning_rate=2e-4))

     # Define the input data order
-    feeding = {"label": 0, "left/data": 1, "right/data": 2}
+    feeding = {"label": 0, "left_data": 1, "right_data": 2}

     # Define end batch and end pass event handler
     def event_handler(event):
         if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
-                print "Pass %d Batch %d Cost %.9f" % (event.pass_id, event.batch_id, event.cost)
-            else:
-                sys.stdout.write(".")
-                sys.stdout.flush()
+            if event.batch_id % 25 == 0:
+                diff = score_diff(
+                    event.gm.getLayerOutputs("right_score")["right_score"]["value"],
+                    event.gm.getLayerOutputs("left_score")["left_score"]["value"])
+                logger.info(("Pass %d Batch %d : Cost %.6f, "
+                             "average absolute diff scores: %.6f") %
+                            (event.pass_id, event.batch_id, event.cost, diff))

         if isinstance(event, paddle.event.EndPass):
             result = trainer.test(reader=test_reader, feeding=feeding)
-            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
-            with gzip.open("ranknet_params_%d.tar.gz" % (event.pass_id), "w") as f:
+            logger.info("\nTest with Pass %d, %s" % (event.pass_id, result.metrics))
+            with gzip.open(
+                    os.path.join(model_save_dir,
+                                 "ranknet_params_%d.tar.gz" % (event.pass_id)),
+                    "w") as f:
                 trainer.save_parameter_to_tar(f)

     trainer.train(
@@ -95,18 +111,17 @@ def train_ranknet(num_passes):
         num_passes=num_passes)


-def ranknet_infer(pass_id):
+def ranknet_infer(model_path):
     """
     load the trained model. And predict with plain txt input
     """
-    print "Begin to Infer..."
+    logger.info("Begin to Infer...")
     feature_dim = 46

     # we just need half_ranknet to predict a rank score,
     # which can be used in sort documents
     output = half_ranknet("infer", feature_dim)
-    parameters = paddle.parameters.Parameters.from_tar(
-        gzip.open("ranknet_params_%d.tar.gz" % (pass_id)))
+    parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))

     # load data of same query and relevance documents,
     # need ranknet to rank these candidates
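Since ranknet_infer now takes a model path, the inference setup reduces to rebuilding the scoring half of the network and loading one saved parameter archive. A minimal sketch using only calls that appear in this diff; the file name models/ranknet_params_9.tar.gz is an illustrative assumption:

# Sketch of the inference setup; mirrors the calls shown in the hunk above.
import gzip
import paddle.v2 as paddle
from ranknet import half_ranknet

paddle.init(use_gpu=False, trainer_count=1)

feature_dim = 46  # MQ2007 uses 46 dense features per document
output = half_ranknet("infer", feature_dim)
parameters = paddle.parameters.Parameters.from_tar(
    gzip.open("models/ranknet_params_9.tar.gz"))
# `output` and `parameters` are then handed to the prediction code that
# follows inside ranknet_infer.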
@@ -133,16 +148,55 @@ def ranknet_infer(pass_id):
     print "query_id : ", query_id, " ranknet rank document order : ", score


-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Ranknet demo')
-    parser.add_argument("--run_type", type=str, help="run type is train|infer")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle RankNet example.")
+    parser.add_argument(
+        "--run_type",
+        type=str,
+        help=("A flag indicating to run the training or the inferring task. "
+              "Available options are: train or infer."),
+        default="train")
     parser.add_argument(
         "--num_passes",
         type=int,
-        help="num of passes in train| infer pass number of model")
+        help="The number of passes to train the model.",
+        default=10)
+    parser.add_argument(
+        "--use_gpu",
+        type=bool,
+        help="A flag indicating whether to use the GPU device in training.",
+        default=False)
+    parser.add_argument(
+        "--trainer_count",
+        type=int,
+        help="The thread number used in training.",
+        default=1)
+    parser.add_argument(
+        "--model_save_dir",
+        type=str,
+        required=False,
+        help=("The path to save the trained models."),
+        default="models")
+    parser.add_argument(
+        "--test_model_path",
+        type=str,
+        required=False,
+        help=("This parameter works only in inferring task to "
+              "specify path of a trained model."),
+        default="")
     args = parser.parse_args()

-    paddle.init(use_gpu=False, trainer_count=4)
+    if not os.path.exists(args.model_save_dir):
+        os.mkdir(args.model_save_dir)
+
+    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)

     if args.run_type == "train":
-        train_ranknet(args.num_passes)
+        ranknet_train(args.num_passes, args.model_save_dir)
     elif args.run_type == "infer":
-        ranknet_infer(pass_id=args.pass_num - 1)
+        assert os.path.exists(args.test_model_path), "The trained model does not exit."
+        ranknet_infer(args.test_model_path)
+    else:
+        logger.fatal(("A wrong value for parameter run type. "
+                      "Available options are: train or infer."))
ltr/run_lambdarank.sh (deleted, 100644 → 0)

#!/bin/sh
python lambda_rank.py \
    --run_type="train" \
    --num_passes=10 \
    2>&1 | tee lambdarank_train.log

python lambda_rank.py \
    --run_type="infer" \
    --num_passes=10 \
    2>&1 | tee lambdarank_infer.log
ltr/run_ranknet.sh (deleted, 100644 → 0)

#!/bin/sh
python ranknet.py \
    --run_type="train" \
    --num_passes=10 \
    2>&1 | tee ranknet_train.log

python ranknet.py \
    --run_type="infer" \
    --num_passes=10 \
    2>&1 | tee ranknet_infer.log
text_classification/utils.py
@@ -9,60 +9,60 @@ logger.setLevel(logging.INFO)

 def parse_train_cmd():
     parser = argparse.ArgumentParser(
-        description="PaddlePaddle text classification demo")
+        description="PaddlePaddle text classification example.")
     parser.add_argument(
         "--nn_type",
         type=str,
-        help="define which type of network to use, available: [dnn, cnn]",
+        help=("A flag that defines which type of network to use, "
+              "available: [dnn, cnn]."),
         default="dnn")
     parser.add_argument(
         "--train_data_dir",
         type=str,
         required=False,
-        help=("path of training dataset (default: None). "
-              "if this parameter is not set, "
-              "paddle.dataset.imdb will be used."),
+        help=("The path of training dataset (default: None). If this parameter "
+              "is not set, paddle.dataset.imdb will be used."),
         default=None)
     parser.add_argument(
         "--test_data_dir",
         type=str,
         required=False,
-        help=("path of testing dataset (default: None). "
-              "if this parameter is not set, "
-              "paddle.dataset.imdb will be used."),
+        help=("The path of testing dataset (default: None). If this parameter "
+              "is not set, paddle.dataset.imdb will be used."),
         default=None)
     parser.add_argument(
         "--word_dict",
         type=str,
         required=False,
-        help=("path of word dictionary (default: None)."
-              "if this parameter is not set, paddle.dataset.imdb will be used."
-              "if this parameter is set, but the file does not exist, "
-              "word dictionay will be built from "
-              "the training data automatically."),
+        help=("The path of word dictionary (default: None). If this parameter "
+              "is not set, paddle.dataset.imdb will be used. If this parameter "
+              "is set, but the file does not exist, word dictionay "
+              "will be built from the training data automatically."),
         default=None)
     parser.add_argument(
         "--label_dict",
         type=str,
         required=False,
-        help=("path of label dictionay (default: None)."
-              "if this parameter is not set, paddle.dataset.imdb will be used."
-              "if this parameter is set, but the file does not exist, "
-              "word dictionay will be built from "
-              "the training data automatically."),
+        help=("The path of label dictionay (default: None).If this parameter "
+              "is not set, paddle.dataset.imdb will be used. If this parameter "
+              "is set, but the file does not exist, word dictionay "
+              "will be built from the training data automatically."),
         default=None)
     parser.add_argument(
         "--batch_size",
         type=int,
         default=32,
-        help="the number of training examples in one forward/backward pass")
+        help="The number of training examples in one forward/backward pass.")
     parser.add_argument(
-        "--num_passes", type=int, default=10, help="number of passes to train")
+        "--num_passes",
+        type=int,
+        default=10,
+        help="The number of passes to train the model.")
     parser.add_argument(
         "--model_save_dir",
         type=str,
         required=False,
-        help=("path to save the trained models."),
+        help=("The path to save the trained models."),
         default="models")
     return parser.parse_args()
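The help-string cleanup above does not change behavior: with no flags passed, parse_train_cmd still resolves to the defaults visible in the diff. A small sanity-check sketch, assuming it runs next to text_classification/utils.py with its dependencies installed:

# Illustrative check of the defaults shown in the diff above.
from utils import parse_train_cmd

args = parse_train_cmd()
print(args.nn_type)         # "dnn"
print(args.batch_size)      # 32
print(args.num_passes)      # 10
print(args.model_save_dir)  # "models"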