Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleRec
提交
e7ff8e50
P
PaddleRec
项目概览
PaddlePaddle
/
PaddleRec
通知
68
Star
12
Fork
5
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
27
列表
看板
标记
里程碑
合并请求
10
Wiki
1
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
27
Issue
27
列表
看板
标记
里程碑
合并请求
10
合并请求
10
Pages
分析
分析
仓库分析
DevOps
Wiki
1
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e7ff8e50
编写于
7月 16, 2020
作者:
Y
yinhaofeng
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add test dict
上级
3ac2d656
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
408 addition
and
0 deletion
+408
-0
models/match/match-pyramid/data/process.py
models/match/match-pyramid/data/process.py
+152
-0
models/match/match-pyramid/data/relation.test.fold1.txt
models/match/match-pyramid/data/relation.test.fold1.txt
+256
-0
未找到文件。
models/match/match-pyramid/data/process.py
0 → 100644
浏览文件 @
e7ff8e50
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
numpy
as
np
import
random
# Read the word dict file into an id -> word mapping
def read_word_dict(filename):
    """Load a word dictionary file into an ``id -> word`` mapping.

    Every non-blank line is split on whitespace; the first field is stored
    as the word and the second field, converted to ``int``, is its key.

    Args:
        filename: Path of the dictionary file to read.

    Returns:
        dict mapping the integer id to the word string.

    Raises:
        IndexError: if a non-blank line has fewer than two fields.
        ValueError: if the second field is not an integer.
    """
    word_dict = {}
    # The context manager closes the handle even when a line fails to parse.
    with open(filename) as dict_file:
        for raw_line in dict_file:
            fields = raw_line.split()
            if not fields:
                # Blank lines (e.g. a trailing empty line) carry no entry.
                continue
            word_dict[int(fields[1])] = fields[0]
    print('[%s]\n\tWord dict size: %d' % (filename, len(word_dict)))
    return word_dict
# Read Embedding File
def read_embedding(filename):
    """Load an embedding text file into an ``id -> vector`` mapping.

    Every non-blank line is split on whitespace; the first field, converted
    to ``int``, is the key and all remaining fields are parsed as floats.

    Args:
        filename: Path of the embedding file to read.

    Returns:
        dict mapping the integer id to a list of floats.

    Raises:
        ValueError: if the id is not an integer or a component is not a float.
    """
    embed = {}
    # The context manager closes the handle even when a line fails to parse.
    with open(filename) as embed_file:
        for raw_line in embed_file:
            fields = raw_line.split()
            if not fields:
                # Blank lines carry no vector.
                continue
            embed[int(fields[0])] = [float(value) for value in fields[1:]]
    print('[%s]\n\tEmbedding size: %d' % (filename, len(embed)))
    return embed
# Convert Embedding Dict 2 numpy array
def convert_embed_2_numpy(embed_dict, embed=None):
    """Copy the vectors of an embedding dict into rows of a numpy matrix.

    Args:
        embed_dict: Mapping from integer row index to a sequence of floats.
        embed: Optional 2-D array that is updated in place; row ``k`` is
            overwritten with ``embed_dict[k]``. When omitted, a zero-filled
            float32 matrix just large enough to hold every key is created
            (its width is taken from the first vector in ``embed_dict``).

    Returns:
        The filled matrix; this is the same object as ``embed`` when one
        was passed in.
    """
    if embed is None:
        # Without this branch the declared default could never work:
        # assigning a row into None raises TypeError.
        if embed_dict:
            n_rows = max(embed_dict) + 1
            n_cols = len(next(iter(embed_dict.values())))
        else:
            n_rows = n_cols = 0
        embed = np.zeros((n_rows, n_cols), dtype=np.float32)
    for row_idx in embed_dict:
        embed[row_idx] = np.array(embed_dict[row_idx])
    print('Generate numpy embed:', embed.shape)
    return embed
# Read Data
def read_data(filename):
    """Load a query/document token file into an ``id -> token ids`` mapping.

    Every non-blank line is split on whitespace. The first field is kept as
    the (string) key, the second field is skipped (presumably a length
    count -- confirm against the data files), and the remaining fields are
    parsed as integers.

    Args:
        filename: Path of the data file to read.

    Returns:
        dict mapping the string id to a list of ints.

    Raises:
        ValueError: if a token field is not an integer.
    """
    data = {}
    # The context manager closes the handle even when a line fails to parse.
    with open(filename) as data_file:
        for raw_line in data_file:
            fields = raw_line.split()
            if not fields:
                # Blank lines carry no record.
                continue
            data[fields[0]] = [int(token) for token in fields[2:]]
    print('[%s]\n\tData size: %s' % (filename, len(data)))
    return data
# Read Relation Data
def read_relation(filename):
    """Load a relation file into a list of ``(label, id1, id2)`` tuples.

    Every non-blank line is split on whitespace; the first field is parsed
    as an integer label and the next two fields are kept as strings.

    Args:
        filename: Path of the relation file to read.

    Returns:
        list of ``(int, str, str)`` tuples in file order.

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
        ValueError: if the label is not an integer.
    """
    data = []
    # The context manager closes the handle even when a line fails to parse.
    with open(filename) as relation_file:
        for raw_line in relation_file:
            fields = raw_line.split()
            if not fields:
                # Blank lines carry no instance.
                continue
            data.append((int(fields[0]), fields[1], fields[2]))
    print('[%s]\n\tInstance size: %s' % (filename, len(data)))
    return data
# Module-level setup: everything below runs when the file is executed or
# imported. It loads the input files, builds the embedding matrix and saves
# it to "embedding.npy" in the current working directory.

# Directory that holds every input file read below.
Letor07Path = "./data"
word_dict = read_word_dict(filename=os.path.join(Letor07Path, 'word_dict.txt'))
query_data = read_data(filename=os.path.join(Letor07Path, 'qid_query.txt'))
doc_data = read_data(filename=os.path.join(Letor07Path, 'docid_doc.txt'))
embed_dict = read_embedding(filename=os.path.join(Letor07Path, 'embed_wiki-pdc_d50_norm'))
# The padding id is the number of entries loaded into word_dict.
# NOTE(review): this only avoids clashing with a real word id if the ids in
# word_dict.txt are exactly 0..len-1 -- confirm against the data.
# The author's note below (193367) is presumably the value observed on the
# original data set.
_PAD_ = len(word_dict)
#193367
# The padding token gets an all-zero vector of width 50 (presumably the
# dimension of the "d50" embedding file -- confirm).
embed_dict[_PAD_] = np.zeros((50, ), dtype=np.float32)
word_dict[_PAD_] = '[PAD]'
# Start from small uniform noise so rows without a loaded vector are still
# initialised; len(word_dict) now includes the padding entry added above.
W_init_embed = np.float32(np.random.uniform(-0.02, 0.02, [len(word_dict), 50]))
# Rows that do have a loaded vector are overwritten in place.
embedding = convert_embed_2_numpy(embed_dict, embed=W_init_embed)
np.save("embedding.npy", embedding)
# Settings used by make_train / make_test below.
# batch_size: number of (query, high doc, low doc) triples sampled per batch.
batch_size = 64
# Queries are padded/truncated to data1_maxlen ids, documents to data2_maxlen.
data1_maxlen = 20
data2_maxlen = 500
# embed_size and train_iters are not referenced by the code visible in this
# file.
embed_size = 50
train_iters = 2500
def make_train():
    """Write randomly sampled pairwise training rows to ./data/train/train.txt.

    Reads ``relation.train.fold1.txt`` from ``Letor07Path``, builds every
    ``(d1, high_d2, low_d2)`` triple in which ``high_d2`` has a strictly
    higher label than ``low_d2`` for the same ``d1``, and then writes 800
    batches. Each batch samples ``batch_size`` triples with
    ``random.choice`` and emits ``2 * batch_size`` lines: first the
    ``batch_size`` (query, higher-label doc) rows, then the matching
    (query, lower-label doc) rows in the same order.

    Each line is ``<comma-joined query ids>\\t<comma-joined doc ids>\\n``,
    with queries padded/truncated to ``data1_maxlen`` and documents to
    ``data2_maxlen`` using ``_PAD_``.

    Raises:
        IndexError: if the relation file yields no triples (nothing to sample).
        KeyError: if a sampled id is missing from ``query_data``/``doc_data``.
    """
    rel = read_relation(filename=os.path.join(Letor07Path, 'relation.train.fold1.txt'))

    # Group the documents as rel_set[d1][label] -> [d2, ...].
    rel_set = {}
    for label, d1, d2 in rel:
        rel_set.setdefault(d1, {}).setdefault(label, []).append(d2)

    # Pair every document with every document of a strictly lower label.
    pair_list = []
    for d1 in rel_set:
        docs_by_label = rel_set[d1]
        label_list = sorted(docs_by_label, reverse=True)
        for hidx, high_label in enumerate(label_list[:-1]):
            for low_label in label_list[hidx + 1:]:
                for high_d2 in docs_by_label[high_label]:
                    for low_d2 in docs_by_label[low_label]:
                        pair_list.append((d1, high_d2, low_d2))
    print('Pair Instance Count:', len(pair_list))

    # The context manager guarantees the output is flushed and closed even
    # if a lookup below raises part-way through.
    with open("./data/train/train.txt", "w") as f:
        for _ in range(800):
            # Start fully padded; real ids overwrite the leading positions.
            X1 = np.full((batch_size * 2, data1_maxlen), _PAD_, dtype=np.int32)
            X2 = np.full((batch_size * 2, data2_maxlen), _PAD_, dtype=np.int32)
            for i in range(batch_size):
                d1, d2p, d2n = random.choice(pair_list)
                query_ids = query_data[d1][:data1_maxlen]
                pos_ids = doc_data[d2p][:data2_maxlen]
                neg_ids = doc_data[d2n][:data2_maxlen]
                # Row i holds the higher-label pair, row i + batch_size the
                # lower-label pair for the same query.
                X1[i, :len(query_ids)] = query_ids
                X2[i, :len(pos_ids)] = pos_ids
                X1[i + batch_size, :len(query_ids)] = query_ids
                X2[i + batch_size, :len(neg_ids)] = neg_ids
            for i in range(batch_size * 2):
                q = [str(x) for x in X1[i]]
                d = [str(x) for x in X2[i]]
                f.write(",".join(q) + "\t" + ",".join(d) + "\n")
def make_test():
    """Write one padded test row per relation to ./data/test/test.txt.

    Reads ``relation.test.fold1.txt`` from ``Letor07Path`` and, for every
    ``(label, d1, d2)`` instance in file order, writes the line
    ``<comma-joined query ids>\\t<comma-joined doc ids>\\t<label>\\t<d1>\\n``.
    Queries are padded/truncated to ``data1_maxlen`` and documents to
    ``data2_maxlen`` using ``_PAD_``.

    Raises:
        KeyError: if an id is missing from ``query_data``/``doc_data``.
    """
    rel = read_relation(filename=os.path.join(Letor07Path, 'relation.test.fold1.txt'))

    # The context manager guarantees the output is flushed and closed even
    # if a lookup below raises part-way through.
    with open("./data/test/test.txt", "w") as f:
        for label, d1, d2 in rel:
            # Start fully padded; real ids overwrite the leading positions.
            X1 = np.full(data1_maxlen, _PAD_, dtype=np.int32)
            X2 = np.full(data2_maxlen, _PAD_, dtype=np.int32)
            query_ids = query_data[d1][:data1_maxlen]
            doc_ids = doc_data[d2][:data2_maxlen]
            X1[:len(query_ids)] = query_ids
            X2[:len(doc_ids)] = doc_ids
            q = [str(x) for x in X1]
            d = [str(x) for x in X2]
            f.write(",".join(q) + "\t" + ",".join(d) + "\t" + str(label) + "\t" + d1 + "\n")
make_train
()
make_test
()
models/match/match-pyramid/data/relation.test.fold1.txt
0 → 100644
浏览文件 @
e7ff8e50
2 9639 GX099-60-3149248
1 9639 GX028-47-6554966
1 9639 GX031-84-2802741
1 9639 GX031-86-1702683
1 9639 GX031-89-11392170
1 9639 GX035-46-10142187
1 9639 GX039-07-1333080
1 9639 GX040-05-15096071
1 9639 GX045-35-10693225
1 9639 GX045-74-6226888
1 9639 GX046-31-8871083
1 9639 GX046-56-6274894
1 9639 GX050-09-14629105
1 9639 GX097-05-12714275
1 9639 GX101-06-7768196
1 9639 GX124-50-4934142
1 9639 GX259-01-13320140
1 9639 GX259-50-8109630
1 9639 GX259-72-16176934
1 9639 GX259-98-7821925
1 9639 GX260-27-13260880
1 9639 GX260-54-6363694
1 9639 GX260-78-6999656
1 9639 GX261-04-0843988
1 9639 GX261-23-4964814
0 9639 GX021-75-7026755
0 9639 GX021-80-16449591
0 9639 GX025-40-7135810
0 9639 GX031-89-9020252
0 9639 GX037-45-0533209
0 9639 GX038-17-11223353
0 9639 GX057-07-13335832
0 9639 GX081-50-12756687
0 9639 GX124-43-2364716
0 9639 GX129-60-0000000
0 9639 GX219-07-7475581
0 9639 GX233-90-7976935
0 9639 GX267-49-2983064
0 9639 GX267-74-2413254
0 9639 GX270-05-13614294
1 9329 GX234-05-0812081
0 9329 GX000-00-0000000
0 9329 GX008-50-3899336
0 9329 GX011-75-8470249
0 9329 GX020-42-13388867
0 9329 GX024-91-8520306
0 9329 GX026-88-6087429
0 9329 GX027-22-1703847
0 9329 GX034-11-2617393
0 9329 GX036-02-7994497
0 9329 GX046-08-13858054
0 9329 GX059-85-11403109
0 9329 GX099-37-0232298
0 9329 GX099-46-11473306
0 9329 GX108-04-9589788
0 9329 GX110-50-11723940
0 9329 GX124-11-4119164
0 9329 GX149-82-15204191
0 9329 GX165-95-6198495
0 9329 GX225-56-4184936
0 9329 GX229-57-4487470
0 9329 GX230-37-4125963
0 9329 GX231-40-14574318
0 9329 GX238-44-10302536
0 9329 GX239-85-8572461
0 9329 GX244-17-10154048
0 9329 GX245-16-4169590
0 9329 GX245-46-6341859
0 9329 GX246-91-8487173
0 9329 GX262-88-13259441
0 9329 GX263-41-4135561
0 9329 GX264-07-6385713
0 9329 GX264-38-12253757
0 9329 GX264-90-15990025
0 9329 GX265-89-6212449
0 9329 GX268-41-12034794
0 9329 GX268-83-5140660
0 9329 GX270-46-0293828
0 9329 GX270-64-11852140
0 9329 GX271-10-12458597
2 9326 GX272-03-6610348
1 9326 GX011-12-0595978
0 9326 GX000-00-0000000
0 9326 GX000-38-9492606
0 9326 GX000-84-4587136
0 9326 GX002-41-5566464
0 9326 GX002-51-2615036
0 9326 GX004-56-12238694
0 9326 GX004-72-2476906
0 9326 GX008-13-1835206
0 9326 GX008-64-7705528
0 9326 GX009-87-0976731
0 9326 GX012-24-7688369
0 9326 GX012-96-8727608
0 9326 GX023-87-16736657
0 9326 GX025-21-11820239
0 9326 GX025-22-15113698
0 9326 GX025-51-13959128
0 9326 GX025-57-11414648
0 9326 GX025-64-7587631
0 9326 GX027-62-4542881
0 9326 GX031-25-4759403
0 9326 GX036-10-7902858
0 9326 GX047-04-9457544
0 9326 GX047-06-4014803
0 9326 GX048-00-15113058
0 9326 GX048-02-12975919
0 9326 GX048-78-3273874
0 9326 GX235-35-0963257
0 9326 GX235-98-3789570
0 9326 GX236-51-15473637
0 9326 GX237-96-0892713
0 9326 GX239-35-7413891
0 9326 GX239-95-0176537
0 9326 GX251-34-10377030
0 9326 GX254-19-11374782
0 9326 GX260-63-10533444
0 9326 GX265-94-14886230
0 9326 GX269-78-1500497
0 9326 GX270-59-10270517
2 8946 GX046-79-6984659
2 8946 GX148-33-1869479
2 8946 GX252-36-12638222
1 8946 GX017-47-13290921
1 8946 GX030-69-3218092
1 8946 GX034-82-4550348
1 8946 GX044-01-9283107
1 8946 GX047-98-6660623
1 8946 GX057-96-12580825
1 8946 GX059-94-12068143
1 8946 GX060-13-13600036
1 8946 GX060-74-6594973
1 8946 GX093-08-1158999
0 8946 GX000-00-0000000
0 8946 GX000-42-15811803
0 8946 GX000-81-16418910
0 8946 GX008-38-10557859
0 8946 GX011-01-10891808
0 8946 GX013-71-5708874
0 8946 GX015-72-4458924
0 8946 GX023-91-9869060
0 8946 GX027-56-6376748
0 8946 GX037-11-10829529
0 8946 GX038-55-0681330
0 8946 GX043-86-4200105
0 8946 GX047-52-3712485
0 8946 GX053-77-4836617
0 8946 GX070-62-1070063
0 8946 GX105-53-13372327
0 8946 GX218-61-6263172
0 8946 GX223-72-13625320
0 8946 GX230-68-14727182
0 8946 GX235-34-7733230
0 8946 GX251-73-0159347
0 8946 GX254-47-1098586
0 8946 GX263-76-6934681
0 8946 GX263-84-8668756
0 8946 GX264-70-14223639
0 8946 GX269-12-5910753
0 8946 GX271-93-9895614
1 9747 GX006-77-1973537
1 9747 GX244-83-8716953
1 9747 GX269-92-7189826
0 9747 GX000-00-0000000
0 9747 GX001-51-8693413
0 9747 GX003-10-2820641
0 9747 GX003-74-0557776
0 9747 GX003-79-13695689
0 9747 GX009-57-0938999
0 9747 GX009-59-8595527
0 9747 GX009-80-10629348
0 9747 GX010-37-0206372
0 9747 GX013-46-2187318
0 9747 GX014-58-4004859
0 9747 GX015-79-5393654
0 9747 GX032-50-7316370
0 9747 GX049-33-2206612
0 9747 GX050-34-0439256
0 9747 GX062-76-0914936
0 9747 GX065-73-7392661
0 9747 GX148-27-15770966
0 9747 GX155-71-0504939
0 9747 GX229-75-14750078
0 9747 GX231-01-0640962
0 9747 GX236-45-15598812
0 9747 GX247-19-9516715
0 9747 GX247-34-4277646
0 9747 GX247-63-10766287
0 9747 GX248-23-15998266
0 9747 GX249-85-9742193
0 9747 GX250-31-7671617
0 9747 GX252-56-2141580
0 9747 GX253-15-3406713
0 9747 GX264-07-15838087
0 9747 GX264-43-6543997
0 9747 GX266-18-14688076
0 9747 GX267-50-2036010
0 9747 GX268-28-0548507
0 9747 GX269-49-14171555
0 9747 GX269-63-15607386
2 9740 GX005-94-14208849
2 9740 GX008-51-5639660
2 9740 GX012-37-2342061
2 9740 GX019-75-13916532
2 9740 GX074-76-16261807
2 9740 GX077-07-2951943
2 9740 GX229-28-11068981
2 9740 GX237-80-7497206
2 9740 GX257-53-10589749
2 9740 GX258-06-0611419
2 9740 GX268-55-9791226
1 9740 GX007-62-1126118
1 9740 GX015-78-0216468
1 9740 GX038-65-1678199
1 9740 GX041-25-14803324
1 9740 GX063-71-0401425
1 9740 GX077-08-15801730
1 9740 GX098-07-2885671
1 9740 GX135-28-6485892
1 9740 GX228-85-10518518
1 9740 GX231-93-11279468
1 9740 GX234-70-15061254
1 9740 GX236-31-11149347
1 9740 GX240-68-1184464
1 9740 GX248-03-7275316
1 9740 GX253-11-9846012
1 9740 GX255-05-10638500
1 9740 GX267-73-4450097
1 9740 GX269-19-0642640
0 9740 GX001-74-5132048
0 9740 GX001-88-2603815
0 9740 GX004-83-7935833
0 9740 GX007-01-16750210
0 9740 GX040-11-5249209
0 9740 GX042-38-2886005
0 9740 GX052-20-4359789
0 9740 GX067-74-3718011
0 9740 GX077-01-13481396
0 9740 GX242-92-8868913
0 9740 GX262-74-4596688
2 8835 GX010-99-5715419
2 8835 GX049-99-2518724
0 8835 GX000-00-0000000
0 8835 GX007-91-6779497
0 8835 GX008-14-0788708
0 8835 GX008-15-13942125
0 8835 GX011-58-14336551
0 8835 GX012-79-10684001
0 8835 GX013-00-10822427
0 8835 GX013-03-5962783
0 8835 GX015-54-0251701
0 8835 GX017-36-5859317
0 8835 GX017-60-0601078
0 8835 GX027-24-16202205
0 8835 GX030-11-15814183
0 8835 GX030-76-11969233
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录