Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
d86fb1d1
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
d86fb1d1
编写于
5月 12, 2017
作者:
D
dzhwinter
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
"precommit format with github style"
上级
82eb0fe4
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
150 addition
and
137 deletion
+150
-137
python/paddle/v2/dataset/mq2007.py
python/paddle/v2/dataset/mq2007.py
+150
-137
未找到文件。
python/paddle/v2/dataset/mq2007.py
浏览文件 @
d86fb1d1
...
...
@@ -23,7 +23,6 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20
"""
import
os
import
random
import
functools
...
...
@@ -31,25 +30,24 @@ import rarfile
from
common
import
download
import
numpy
as
np
# URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar"
URL
=
"http://www.bigdatalab.ac.cn/benchmark/upload/download_source/7b6dbbe2-842c-11e4-a536-bcaec51b9163_MQ2007.rar"
MD5
=
"7be1640ae95c6408dab0ae7207bdc706"
def
__initialize_meta_info__
():
"""
"""
download and extract the MQ2007 dataset
"""
fn
=
fetch
()
rar
=
rarfile
.
RarFile
(
fn
)
dirpath
=
os
.
path
.
dirname
(
fn
)
rar
.
extractall
(
path
=
dirpath
)
return
dirpath
fn
=
fetch
()
rar
=
rarfile
.
RarFile
(
fn
)
dirpath
=
os
.
path
.
dirname
(
fn
)
rar
.
extractall
(
path
=
dirpath
)
return
dirpath
class
Query
(
object
):
"""
"""
queries used for learning to rank algorithms. It is created from relevance scores, query-document feature vectors
Parameters:
...
...
@@ -63,79 +61,86 @@ class Query(object):
description : string
comment section in query doc pair data
"""
def
__init__
(
self
,
query_id
=-
1
,
relevance_score
=-
1
,
feature_vector
=
None
,
description
=
""
):
self
.
query_id
=
query_id
self
.
relevance_score
=
relevance_score
if
feature_vector
is
None
:
self
.
feature_vector
=
[]
else
:
self
.
feature_vector
=
feature_vector
self
.
description
=
description
def
__str__
(
self
):
string
=
"%s %s %s"
%
(
str
(
self
.
relevance_score
),
str
(
self
.
query_id
),
" "
.
join
(
str
(
f
)
for
f
in
self
.
feature_vector
))
return
string
def
__init__
(
self
,
query_id
=-
1
,
relevance_score
=-
1
,
feature_vector
=
None
,
description
=
""
):
self
.
query_id
=
query_id
self
.
relevance_score
=
relevance_score
if
feature_vector
is
None
:
self
.
feature_vector
=
[]
else
:
self
.
feature_vector
=
feature_vector
self
.
description
=
description
# @classmethod
def
_parse_
(
self
,
text
):
"""
def
__str__
(
self
):
string
=
"%s %s %s"
%
(
str
(
self
.
relevance_score
),
str
(
self
.
query_id
),
" "
.
join
(
str
(
f
)
for
f
in
self
.
feature_vector
))
return
string
# @classmethod
def
_parse_
(
self
,
text
):
"""
parse line into Query
"""
comment_position
=
text
.
find
(
'#'
)
line
=
text
[:
comment_position
].
strip
()
self
.
description
=
text
[
comment_position
+
1
:].
strip
()
parts
=
line
.
split
()
assert
(
len
(
parts
)
==
48
),
"expect 48 space split parts, get %d"
%
(
len
(
parts
))
# format : 0 qid:10 1:0.000272 2:0.000000 ....
self
.
relevance_score
=
int
(
parts
[
0
])
self
.
query_id
=
int
(
parts
[
1
].
split
(
':'
)[
1
])
for
p
in
parts
[
2
:]:
pair
=
p
.
split
(
':'
)
self
.
feature_vector
.
append
(
float
(
pair
[
1
]))
return
self
comment_position
=
text
.
find
(
'#'
)
line
=
text
[:
comment_position
].
strip
()
self
.
description
=
text
[
comment_position
+
1
:].
strip
()
parts
=
line
.
split
()
assert
(
len
(
parts
)
==
48
),
"expect 48 space split parts, get %d"
%
(
len
(
parts
))
# format : 0 qid:10 1:0.000272 2:0.000000 ....
self
.
relevance_score
=
int
(
parts
[
0
])
self
.
query_id
=
int
(
parts
[
1
].
split
(
':'
)[
1
])
for
p
in
parts
[
2
:]:
pair
=
p
.
split
(
':'
)
self
.
feature_vector
.
append
(
float
(
pair
[
1
]))
return
self
class
QueryList
(
object
):
"""
"""
group query into list, every item in list is a Query
"""
def
__init__
(
self
,
querylist
=
None
):
self
.
query_id
=
-
1
if
querylist
is
None
:
self
.
querylist
=
[]
else
:
self
.
querylist
=
querylist
for
query
in
self
.
querylist
:
def
__init__
(
self
,
querylist
=
None
):
self
.
query_id
=
-
1
if
querylist
is
None
:
self
.
querylist
=
[]
else
:
self
.
querylist
=
querylist
for
query
in
self
.
querylist
:
if
self
.
query_id
==
-
1
:
self
.
query_id
=
query
.
query_id
else
:
if
self
.
query_id
!=
query
.
query_id
:
raise
ValueError
(
"query in list must be same query_id"
)
def
__iter__
(
self
):
for
query
in
self
.
querylist
:
yield
query
def
__len__
(
self
):
return
len
(
self
.
querylist
)
def
_correct_ranking_
(
self
):
if
self
.
querylist
is
None
:
return
self
.
querylist
.
sort
(
key
=
lambda
x
:
x
.
relevance_score
,
reverse
=
True
)
def
_add_query
(
self
,
query
):
if
self
.
query_id
==
-
1
:
self
.
query_id
=
query
.
query_id
self
.
query_id
=
query
.
query_id
else
:
if
self
.
query_id
!=
query
.
query_id
:
raise
ValueError
(
"query in list must be same query_id"
)
def
__iter__
(
self
):
for
query
in
self
.
querylist
:
yield
query
def
__len__
(
self
):
return
len
(
self
.
querylist
)
def
_correct_ranking_
(
self
):
if
self
.
querylist
is
None
:
return
self
.
querylist
.
sort
(
key
=
lambda
x
:
x
.
relevance_score
,
reverse
=
True
)
def
_add_query
(
self
,
query
):
if
self
.
query_id
==
-
1
:
self
.
query_id
=
query
.
query_id
else
:
if
self
.
query_id
!=
query
.
query_id
:
raise
ValueError
(
"query in list must be same query_id"
)
self
.
querylist
.
append
(
query
)
if
self
.
query_id
!=
query
.
query_id
:
raise
ValueError
(
"query in list must be same query_id"
)
self
.
querylist
.
append
(
query
)
def
gen_pair
(
querylist
,
partial_order
=
"full"
):
"""
"""
gen pair for pair-wise learning to rank algorithm
Paramters:
--------
...
...
@@ -149,35 +154,41 @@ def gen_pair(querylist, partial_order="full"):
query_left : np.array, shape=(1, feature_dimension)
query_right : same as left
"""
if
not
isinstance
(
querylist
,
QueryList
):
querylist
=
QueryList
(
querylist
)
querylist
.
_correct_ranking_
()
# C(n,2)
if
partial_order
==
"full"
:
for
i
,
query_left
in
enumerate
(
querylist
):
for
j
,
query_right
in
enumerate
(
querylist
):
if
query_left
.
relevance_score
>
query_right
.
relevance_score
:
yield
1
,
np
.
array
(
query_left
.
feature_vector
),
np
.
array
(
query_right
.
feature_vector
)
else
:
yield
1
,
np
.
array
(
query_left
.
feature_vector
),
np
.
array
(
query_right
.
feature_vector
)
elif
partial_order
==
"neighbour"
:
# C(n)
k
=
0
while
k
<
len
(
querylist
)
-
1
:
query_left
=
querylist
[
k
]
query_right
=
querylist
[
k
+
1
]
if
query_left
.
relevance_score
>
query_right
.
relevance_score
:
yield
1
,
np
.
array
(
query_left
.
feature_vector
),
np
.
array
(
query_right
.
feature_vector
)
else
:
yield
1
,
np
.
array
(
query_left
.
feature_vector
),
np
.
array
(
query_right
.
feature_vector
)
k
+=
1
else
:
raise
ValueError
(
"unsupport parameter of partial_order, Only can be neighbour or full"
)
if
not
isinstance
(
querylist
,
QueryList
):
querylist
=
QueryList
(
querylist
)
querylist
.
_correct_ranking_
()
# C(n,2)
if
partial_order
==
"full"
:
for
i
,
query_left
in
enumerate
(
querylist
):
for
j
,
query_right
in
enumerate
(
querylist
):
if
query_left
.
relevance_score
>
query_right
.
relevance_score
:
yield
1
,
np
.
array
(
query_left
.
feature_vector
),
np
.
array
(
query_right
.
feature_vector
)
else
:
yield
1
,
np
.
array
(
query_left
.
feature_vector
),
np
.
array
(
query_right
.
feature_vector
)
elif
partial_order
==
"neighbour"
:
# C(n)
k
=
0
while
k
<
len
(
querylist
)
-
1
:
query_left
=
querylist
[
k
]
query_right
=
querylist
[
k
+
1
]
if
query_left
.
relevance_score
>
query_right
.
relevance_score
:
yield
1
,
np
.
array
(
query_left
.
feature_vector
),
np
.
array
(
query_right
.
feature_vector
)
else
:
yield
1
,
np
.
array
(
query_left
.
feature_vector
),
np
.
array
(
query_right
.
feature_vector
)
k
+=
1
else
:
raise
ValueError
(
"unsupport parameter of partial_order, Only can be neighbour or full"
)
def
gen_list
(
querylist
):
"""
"""
gen item in list for list-wise learning to rank algorithm
Paramters:
--------
...
...
@@ -188,41 +199,39 @@ def gen_list(querylist):
label : np.array, shape=(samples_num, )
querylist : np.array, shape=(samples_num, feature_dimension)
"""
if
not
isinstance
(
querylist
,
QueryList
):
querylist
=
QueryList
(
querylist
)
querylist
.
_correct_ranking_
()
relevance_score_list
=
[
query
.
relevance_score
for
query
in
querylist
]
feature_vector_list
=
[
query
.
feature_vector
for
query
in
querylist
]
# yield np.array(relevance_score_list).T, np.array(feature_vector_list)
for
i
in
range
(
len
(
querylist
)):
yield
relevance_score_list
[
i
],
np
.
array
(
feature_vector_list
[
i
])
if
not
isinstance
(
querylist
,
QueryList
):
querylist
=
QueryList
(
querylist
)
# querylist._correct_ranking_()
relevance_score_list
=
[
query
.
relevance_score
for
query
in
querylist
]
feature_vector_list
=
[
query
.
feature_vector
for
query
in
querylist
]
yield
np
.
array
(
relevance_score_list
).
T
,
np
.
array
(
feature_vector_list
)
def
load_from_text
(
filepath
,
shuffle
=
True
,
fill_missing
=-
1
):
"""
"""
parse data file into querys
"""
prev_query_id
=
-
1
;
querylists
=
[]
querylist
=
None
fn
=
__initialize_meta_info__
()
with
open
(
os
.
path
.
join
(
fn
,
filepath
))
as
f
:
for
line
in
f
:
query
=
Query
()
query
=
query
.
_parse_
(
line
)
if
query
.
query_id
!=
prev_query_id
:
if
querylist
is
not
None
:
querylists
.
append
(
querylist
)
querylist
=
QueryList
()
prev_query_id
=
query
.
query_id
querylist
.
_add_query
(
query
)
if
shuffle
==
True
:
random
.
shuffle
(
querylists
)
return
querylists
prev_query_id
=
-
1
querylists
=
[]
querylist
=
None
fn
=
__initialize_meta_info__
()
with
open
(
os
.
path
.
join
(
fn
,
filepath
))
as
f
:
for
line
in
f
:
query
=
Query
()
query
=
query
.
_parse_
(
line
)
if
query
.
query_id
!=
prev_query_id
:
if
querylist
is
not
None
:
querylists
.
append
(
querylist
)
querylist
=
QueryList
()
prev_query_id
=
query
.
query_id
querylist
.
_add_query
(
query
)
if
shuffle
==
True
:
random
.
shuffle
(
querylists
)
return
querylists
def
__reader__
(
filepath
,
format
=
"pairwise"
,
shuffle
=
True
,
fill_missing
=-
1
):
"""
"""
Parameters
--------
filename : string
...
...
@@ -235,23 +244,27 @@ def __reader__(filepath, format="pairwise", shuffle=True, fill_missing=-1):
label query_left, query_right # format = "pairwise"
label querylist # format = "listwise"
"""
querylists
=
load_from_text
(
filepath
,
shuffle
=
shuffle
,
fill_missing
=
fill_missing
)
for
querylist
in
querylists
:
if
format
==
"pairwise"
:
for
pair
in
gen_pair
(
querylist
)
:
yield
pair
elif
format
==
"listwise"
:
# yield next(gen_list(querylist))
for
instance
in
gen_list
(
querylist
):
yield
instance
train
=
functools
.
partial
(
__reader__
,
filepath
=
"MQ2007/MQ2007/Fold1/train.txt"
)
querylists
=
load_from_text
(
filepath
,
shuffle
=
shuffle
,
fill_missing
=
fill_missing
)
for
querylist
in
querylists
:
if
format
==
"pairwise"
:
for
pair
in
gen_pair
(
querylist
):
yield
pair
elif
format
==
"listwise"
:
yield
next
(
gen_list
(
querylist
))
train
=
functools
.
partial
(
__reader__
,
filepath
=
"MQ2007/MQ2007/Fold1/train.txt"
)
test
=
functools
.
partial
(
__reader__
,
filepath
=
"MQ2007/MQ2007/Fold1/test.txt"
)
def
fetch
():
return
download
(
URL
,
"MQ2007"
,
MD5
)
return
download
(
URL
,
"MQ2007"
,
MD5
)
if
__name__
==
"__main__"
:
fetch
()
if
__name__
==
"__main__"
:
fetch
()
for
i
,
(
score
,
samples
)
in
enumerate
(
train
(
format
=
"listwise"
,
shuffle
=
False
)):
np
.
savetxt
(
"query_%d"
%
(
i
),
score
,
fmt
=
"%.2f"
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录