Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
a4313de8
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
a4313de8
编写于
5月 15, 2017
作者:
D
dzhwinter
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
"remove the pairwise other genereate method"
上级
4ac5caaa
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
103 addition
and
36 deletion
+103
-36
python/paddle/v2/dataset/mq2007.py
python/paddle/v2/dataset/mq2007.py
+103
-36
未找到文件。
python/paddle/v2/dataset/mq2007.py
浏览文件 @
a4313de8
...
...
@@ -89,8 +89,10 @@ class Query(object):
line
=
text
[:
comment_position
].
strip
()
self
.
description
=
text
[
comment_position
+
1
:].
strip
()
parts
=
line
.
split
()
assert
(
len
(
parts
)
==
48
),
"expect 48 space split parts, get %d"
%
(
len
(
parts
))
if
len
(
parts
)
!=
48
:
sys
.
stdout
.
write
(
"expect 48 space split parts, get %d"
%
(
len
(
parts
)))
return
None
# format : 0 qid:10 1:0.000272 2:0.000000 ....
self
.
relevance_score
=
int
(
parts
[
0
])
self
.
query_id
=
int
(
parts
[
1
].
split
(
':'
)[
1
])
...
...
@@ -125,6 +127,9 @@ class QueryList(object):
def
__len__
(
self
):
return
len
(
self
.
querylist
)
def
__getitem__
(
self
,
i
):
return
self
.
querylist
[
i
]
def
_correct_ranking_
(
self
):
if
self
.
querylist
is
None
:
return
...
...
@@ -139,6 +144,46 @@ class QueryList(object):
self
.
querylist
.
append
(
query
)
def
gen_plain_txt
(
querylist
):
"""
gen plain text in list for other usage
Paramters:
--------
querylist : querylist, one query match many docment pairs in list, see QueryList
return :
------
query_id : np.array, shape=(samples_num, )
label : np.array, shape=(samples_num, )
querylist : np.array, shape=(samples_num, feature_dimension)
"""
if
not
isinstance
(
querylist
,
QueryList
):
querylist
=
QueryList
(
querylist
)
querylist
.
_correct_ranking_
()
for
query
in
querylist
:
yield
querylist
.
query_id
,
query
.
relevance_score
,
np
.
array
(
query
.
feature_vector
)
def
gen_point
(
querylist
):
"""
gen item in list for point-wise learning to rank algorithm
Paramters:
--------
querylist : querylist, one query match many docment pairs in list, see QueryList
return :
------
label : np.array, shape=(samples_num, )
querylist : np.array, shape=(samples_num, feature_dimension)
"""
if
not
isinstance
(
querylist
,
QueryList
):
querylist
=
QueryList
(
querylist
)
querylist
.
_correct_ranking_
()
for
query
in
querylist
:
yield
query
.
relevance_score
,
np
.
array
(
query
.
feature_vector
)
def
gen_pair
(
querylist
,
partial_order
=
"full"
):
"""
gen pair for pair-wise learning to rank algorithm
...
...
@@ -146,6 +191,7 @@ def gen_pair(querylist, partial_order="full"):
--------
querylist : querylist, one query match many docment pairs in list, see QueryList
pairtial_order : "full" or "neighbour"
there is redudant in all possiable pair combinations, which can be simplifed
gen pairs for neighbour items or the full partial order pairs
return :
...
...
@@ -157,34 +203,28 @@ def gen_pair(querylist, partial_order="full"):
if
not
isinstance
(
querylist
,
QueryList
):
querylist
=
QueryList
(
querylist
)
querylist
.
_correct_ranking_
()
labels
=
[]
docpairs
=
[]
# C(n,2)
if
partial_order
==
"full"
:
for
i
,
query_left
in
enumerate
(
querylist
):
for
j
,
query_right
in
enumerate
(
querylist
):
if
query_left
.
relevance_score
>
query_right
.
relevance_score
:
yield
1
,
np
.
array
(
query_left
.
feature_vector
),
np
.
array
(
query_right
.
feature_vector
)
else
:
yield
1
,
np
.
array
(
query_left
.
feature_vector
),
np
.
array
(
query_right
.
feature_vector
)
elif
partial_order
==
"neighbour"
:
# C(n)
k
=
0
while
k
<
len
(
querylist
)
-
1
:
query_left
=
querylist
[
k
]
query_right
=
querylist
[
k
+
1
]
for
i
in
range
(
len
(
querylist
)):
query_left
=
querylist
[
i
]
for
j
in
range
(
i
+
1
,
len
(
querylist
)):
query_right
=
querylist
[
j
]
if
query_left
.
relevance_score
>
query_right
.
relevance_score
:
yield
1
,
np
.
array
(
query_left
.
feature_vector
),
np
.
array
(
query_right
.
feature_vector
)
else
:
yield
1
,
np
.
array
(
query_left
.
feature_vector
),
np
.
array
(
query_right
.
feature_vector
)
k
+=
1
else
:
raise
ValueError
(
"unsupport parameter of partial_order, Only can be neighbour or full"
)
labels
.
append
(
1
)
docpairs
.
append
([
np
.
array
(
query_left
.
feature_vector
),
np
.
array
(
query_right
.
feature_vector
)
])
elif
query_left
.
relevance_score
<
query_right
.
relevance_score
:
labels
.
append
(
1
)
docpairs
.
append
([
np
.
array
(
query_right
.
feature_vector
),
np
.
array
(
query_left
.
feature_vector
)
])
for
label
,
pair
in
zip
(
labels
,
docpairs
):
yield
label
,
pair
[
0
],
pair
[
1
]
def
gen_list
(
querylist
):
...
...
@@ -201,12 +241,30 @@ def gen_list(querylist):
"""
if
not
isinstance
(
querylist
,
QueryList
):
querylist
=
QueryList
(
querylist
)
#
querylist._correct_ranking_()
querylist
.
_correct_ranking_
()
relevance_score_list
=
[
query
.
relevance_score
for
query
in
querylist
]
feature_vector_list
=
[
query
.
feature_vector
for
query
in
querylist
]
yield
np
.
array
(
relevance_score_list
).
T
,
np
.
array
(
feature_vector_list
)
def
query_filter
(
querylists
):
"""
filter query get only document with label 0.
label 0, 1, 2 means the relevance score document with query
parameters :
querylist : QueyList list
return :
querylist : QueyList list
"""
filter_query
=
[]
for
querylist
in
querylists
:
relevance_score_list
=
[
query
.
relevance_score
for
query
in
querylist
]
if
sum
(
relevance_score_list
)
!=
.
0
:
filter_query
.
append
(
querylist
)
return
filter_query
def
load_from_text
(
filepath
,
shuffle
=
True
,
fill_missing
=-
1
):
"""
parse data file into querys
...
...
@@ -219,12 +277,16 @@ def load_from_text(filepath, shuffle=True, fill_missing=-1):
for
line
in
f
:
query
=
Query
()
query
=
query
.
_parse_
(
line
)
if
query
==
None
:
continue
if
query
.
query_id
!=
prev_query_id
:
if
querylist
is
not
None
:
querylists
.
append
(
querylist
)
querylist
=
QueryList
()
prev_query_id
=
query
.
query_id
querylist
.
_add_query
(
query
)
if
querylist
is
not
None
:
querylists
.
append
(
querylist
)
if
shuffle
==
True
:
random
.
shuffle
(
querylists
)
return
querylists
...
...
@@ -244,10 +306,15 @@ def __reader__(filepath, format="pairwise", shuffle=True, fill_missing=-1):
label query_left, query_right # format = "pairwise"
label querylist # format = "listwise"
"""
querylists
=
load_from_text
(
filepath
,
shuffle
=
shuffle
,
fill_missing
=
fill_missing
)
querylists
=
query_filter
(
load_from_text
(
filepath
,
shuffle
=
shuffle
,
fill_missing
=
fill_missing
))
for
querylist
in
querylists
:
if
format
==
"pairwise"
:
if
format
==
"plain_txt"
:
yield
next
(
gen_plain_txt
(
querylist
))
elif
format
==
"pointwise"
:
yield
next
(
gen_point
(
querylist
))
elif
format
==
"pairwise"
:
for
pair
in
gen_pair
(
querylist
):
yield
pair
elif
format
==
"listwise"
:
...
...
@@ -264,7 +331,7 @@ def fetch():
if
__name__
==
"__main__"
:
fetch
()
for
i
,
(
score
,
samples
)
in
enumerate
(
train
(
format
=
"listwise"
,
shuffle
=
False
)
):
np
.
savetxt
(
"query_%d"
%
(
i
),
score
,
fmt
=
"%.2f"
)
mytest
=
functools
.
partial
(
__reader__
,
filepath
=
"MQ2007/MQ2007/Fold1/sample"
,
format
=
"listwise"
)
for
label
,
query
in
mytest
(
):
print
label
,
query
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录