Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
PaddleRec
提交
e7ff8e50
P
PaddleRec
项目概览
BaiXuePrincess
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e7ff8e50
编写于
7月 16, 2020
作者:
Y
yinhaofeng
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add test dict
上级
3ac2d656
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
408 additions
and
0 deletions
+408
-0
models/match/match-pyramid/data/process.py
models/match/match-pyramid/data/process.py
+152
-0
models/match/match-pyramid/data/relation.test.fold1.txt
models/match/match-pyramid/data/relation.test.fold1.txt
+256
-0
未找到文件。
models/match/match-pyramid/data/process.py
0 → 100644
浏览文件 @
e7ff8e50
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
numpy
as
np
import
random
# Read Word Dict (maps word id -> word)
def read_word_dict(filename):
    """Load the id -> word mapping stored in ``filename``.

    Every non-blank line is split on whitespace; the first field is used as
    the word and the second field, converted to ``int``, as its id.

    Args:
        filename: Path of the word dictionary file.

    Returns:
        dict mapping the integer id to the word string.
    """
    word_dict = {}
    # The context manager closes the handle even if a line fails to parse.
    with open(filename) as fin:
        for line in fin:
            fields = line.strip().split()
            if not fields:
                # Tolerate blank lines (e.g. an empty line at end of file).
                continue
            word_dict[int(fields[1])] = fields[0]
    print('[%s]\n\tWord dict size: %d' % (filename, len(word_dict)))
    return word_dict
# Read Embedding File
def read_embedding(filename):
    """Load embedding vectors from a whitespace-separated text file.

    Every non-blank line is split on whitespace; the first field, converted
    to ``int``, is the key and the remaining fields are parsed as floats.

    Args:
        filename: Path of the embedding file.

    Returns:
        dict mapping the integer key to a list of floats.
    """
    embed = {}
    # The context manager closes the handle even if a line fails to parse.
    with open(filename) as fin:
        for line in fin:
            fields = line.strip().split()
            if not fields:
                # Tolerate blank lines (e.g. an empty line at end of file).
                continue
            embed[int(fields[0])] = [float(value) for value in fields[1:]]
    print('[%s]\n\tEmbedding size: %d' % (filename, len(embed)))
    return embed
# Convert Embedding Dict 2 numpy array
def convert_embed_2_numpy(embed_dict, embed=None):
    """Copy the vectors of ``embed_dict`` into the rows of a numpy matrix.

    Args:
        embed_dict: Mapping from integer row index to a vector (any sequence
            accepted by ``np.array``).
        embed: Optional 2-D array that is filled in place; rows whose index
            is not a key of ``embed_dict`` are left untouched. When omitted,
            a zero-filled float32 matrix just large enough to hold every key
            is allocated.

    Returns:
        The filled matrix (the very same object as ``embed`` when one was
        passed in).
    """
    if embed is None:
        # The previous default of None failed on the first row assignment;
        # size a fresh matrix from the dictionary instead.
        if embed_dict:
            n_rows = max(embed_dict) + 1
            n_cols = len(next(iter(embed_dict.values())))
        else:
            n_rows = n_cols = 0
        embed = np.zeros((n_rows, n_cols), dtype=np.float32)
    for row, vector in embed_dict.items():
        embed[row] = np.array(vector)
    print('Generate numpy embed:', embed.shape)
    return embed
# Read Data
def read_data(filename):
    """Load id -> token-id sequences from a whitespace-separated text file.

    Every non-blank line is split on whitespace. The first field is kept as
    the (string) key, the second field is ignored, and every following field
    is converted to ``int``.

    Args:
        filename: Path of the data file.

    Returns:
        dict mapping the string key to a list of ints.
    """
    data = {}
    # The context manager closes the handle even if a line fails to parse.
    with open(filename) as fin:
        for line in fin:
            fields = line.strip().split()
            if not fields:
                # Tolerate blank lines (e.g. an empty line at end of file).
                continue
            # fields[1] is skipped; presumably a length column - TODO confirm.
            data[fields[0]] = [int(token) for token in fields[2:]]
    print('[%s]\n\tData size: %s' % (filename, len(data)))
    return data
# Read Relation Data
def read_relation(filename):
    """Load ``(label, id1, id2)`` triples from a relation file.

    Every non-blank line is split on whitespace; the first field is converted
    to ``int`` and the next two fields are kept as strings.

    Args:
        filename: Path of the relation file.

    Returns:
        list of ``(int, str, str)`` tuples in file order.
    """
    data = []
    # The context manager closes the handle even if a line fails to parse.
    with open(filename) as fin:
        for line in fin:
            fields = line.strip().split()
            if not fields:
                # Tolerate blank lines (e.g. an empty line at end of file).
                continue
            data.append((int(fields[0]), fields[1], fields[2]))
    print('[%s]\n\tInstance size: %s' % (filename, len(data)))
    return data
# Batch and sequence geometry used by make_train / make_test.
batch_size = 64
data1_maxlen = 20
data2_maxlen = 500
embed_size = 50
train_iters = 2500

# Directory that holds the raw corpus files.
Letor07Path = "./data"

word_dict = read_word_dict(
    filename=os.path.join(Letor07Path, 'word_dict.txt'))
query_data = read_data(
    filename=os.path.join(Letor07Path, 'qid_query.txt'))
doc_data = read_data(
    filename=os.path.join(Letor07Path, 'docid_doc.txt'))
embed_dict = read_embedding(
    filename=os.path.join(Letor07Path, 'embed_wiki-pdc_d50_norm'))

# The padding id is the vocabulary size at this point (original note: 193367).
# NOTE(review): this only avoids a clash if word ids are 0..N-1 - confirm.
_PAD_ = len(word_dict)
embed_dict[_PAD_] = np.zeros(embed_size, dtype=np.float32)
word_dict[_PAD_] = '[PAD]'

# Start from small uniform noise, then overwrite every row that has an entry
# in embed_dict (including the all-zero padding row).
W_init_embed = np.random.uniform(
    -0.02, 0.02, [len(word_dict), embed_size]).astype(np.float32)
embedding = convert_embed_2_numpy(embed_dict, embed=W_init_embed)
np.save("embedding.npy", embedding)
def _build_pair_list(rel):
    """Turn ``(label, d1, d2)`` triples into ``(d1, high_d2, low_d2)`` pairs.

    For each ``d1``, every ``d2`` with a higher label is paired with every
    ``d2`` with a strictly lower label.
    """
    # d1 -> label -> [d2, ...], keeping the order in which they were read.
    rel_set = {}
    for label, d1, d2 in rel:
        rel_set.setdefault(d1, {}).setdefault(label, []).append(d2)

    pair_list = []
    for d1, by_label in rel_set.items():
        label_list = sorted(by_label, reverse=True)
        for hidx, high_label in enumerate(label_list[:-1]):
            for low_label in label_list[hidx + 1:]:
                for high_d2 in by_label[high_label]:
                    for low_d2 in by_label[low_label]:
                        pair_list.append((d1, high_d2, low_d2))
    return pair_list


def make_train(num_batches=800, out_path="./data/train/train.txt"):
    """Sample pairwise training batches and write them to ``out_path``.

    Reads ``relation.train.fold1.txt`` under ``Letor07Path``, builds all
    (query, higher-labelled doc, lower-labelled doc) pairs and then, for each
    batch, draws ``batch_size`` pairs at random with replacement. A batch is
    written as ``2 * batch_size`` lines: the first half holds each query with
    its higher-labelled doc, the second half the same queries with the
    lower-labelled doc. Each line is ``<query ids>\\t<doc ids>`` with ids
    comma-separated, truncated or padded with ``_PAD_`` to ``data1_maxlen``
    and ``data2_maxlen`` respectively.

    Args:
        num_batches: Number of batches to write (previously fixed at 800).
        out_path: Destination file; its directory is created when missing.

    Raises:
        IndexError: From ``random.choice`` if no pair could be built.
    """
    rel = read_relation(
        filename=os.path.join(Letor07Path, 'relation.train.fold1.txt'))
    pair_list = _build_pair_list(rel)
    print('Pair Instance Count:', len(pair_list))

    # open() does not create intermediate directories, so do it up front.
    out_dir = os.path.dirname(out_path)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # The context manager guarantees the file is flushed and closed even if
    # a lookup below raises part-way through.
    with open(out_path, "w") as f:
        for _ in range(num_batches):
            X1 = np.full((batch_size * 2, data1_maxlen), _PAD_, dtype=np.int32)
            X2 = np.full((batch_size * 2, data2_maxlen), _PAD_, dtype=np.int32)
            for i in range(batch_size):
                d1, d2p, d2n = random.choice(pair_list)
                d1_len = min(data1_maxlen, len(query_data[d1]))
                d2p_len = min(data2_maxlen, len(doc_data[d2p]))
                d2n_len = min(data2_maxlen, len(doc_data[d2n]))
                # Row i carries the higher-labelled doc, row i + batch_size
                # the lower-labelled doc for the same query.
                X1[i, :d1_len] = query_data[d1][:d1_len]
                X2[i, :d2p_len] = doc_data[d2p][:d2p_len]
                X1[i + batch_size, :d1_len] = query_data[d1][:d1_len]
                X2[i + batch_size, :d2n_len] = doc_data[d2n][:d2n_len]
            for i in range(batch_size * 2):
                q = [str(x) for x in X1[i]]
                d = [str(x) for x in X2[i]]
                f.write(",".join(q) + "\t" + ",".join(d) + "\n")
def make_test(out_path="./data/test/test.txt"):
    """Write one padded test instance per relation triple to ``out_path``.

    Reads ``relation.test.fold1.txt`` under ``Letor07Path``. For every
    ``(label, d1, d2)`` triple one line is written:
    ``<query ids>\\t<doc ids>\\t<label>\\t<d1>``, where the id lists are
    comma-separated and truncated or padded with ``_PAD_`` to
    ``data1_maxlen`` and ``data2_maxlen`` respectively.

    Args:
        out_path: Destination file; its directory is created when missing.
    """
    rel = read_relation(
        filename=os.path.join(Letor07Path, 'relation.test.fold1.txt'))

    # open() does not create intermediate directories, so do it up front.
    out_dir = os.path.dirname(out_path)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # The context manager guarantees the file is flushed and closed even if
    # a lookup below raises part-way through.
    with open(out_path, "w") as f:
        for label, d1, d2 in rel:
            X1 = np.full(data1_maxlen, _PAD_, dtype=np.int32)
            X2 = np.full(data2_maxlen, _PAD_, dtype=np.int32)
            d1_len = min(data1_maxlen, len(query_data[d1]))
            d2_len = min(data2_maxlen, len(doc_data[d2]))
            X1[:d1_len] = query_data[d1][:d1_len]
            X2[:d2_len] = doc_data[d2][:d2_len]
            q = [str(x) for x in X1]
            d = [str(x) for x in X2]
            f.write(",".join(q) + "\t" + ",".join(d) + "\t" + str(label) +
                    "\t" + d1 + "\n")
make_train
()
make_test
()
models/match/match-pyramid/data/relation.test.fold1.txt
0 → 100644
浏览文件 @
e7ff8e50
2 9639 GX099-60-3149248
1 9639 GX028-47-6554966
1 9639 GX031-84-2802741
1 9639 GX031-86-1702683
1 9639 GX031-89-11392170
1 9639 GX035-46-10142187
1 9639 GX039-07-1333080
1 9639 GX040-05-15096071
1 9639 GX045-35-10693225
1 9639 GX045-74-6226888
1 9639 GX046-31-8871083
1 9639 GX046-56-6274894
1 9639 GX050-09-14629105
1 9639 GX097-05-12714275
1 9639 GX101-06-7768196
1 9639 GX124-50-4934142
1 9639 GX259-01-13320140
1 9639 GX259-50-8109630
1 9639 GX259-72-16176934
1 9639 GX259-98-7821925
1 9639 GX260-27-13260880
1 9639 GX260-54-6363694
1 9639 GX260-78-6999656
1 9639 GX261-04-0843988
1 9639 GX261-23-4964814
0 9639 GX021-75-7026755
0 9639 GX021-80-16449591
0 9639 GX025-40-7135810
0 9639 GX031-89-9020252
0 9639 GX037-45-0533209
0 9639 GX038-17-11223353
0 9639 GX057-07-13335832
0 9639 GX081-50-12756687
0 9639 GX124-43-2364716
0 9639 GX129-60-0000000
0 9639 GX219-07-7475581
0 9639 GX233-90-7976935
0 9639 GX267-49-2983064
0 9639 GX267-74-2413254
0 9639 GX270-05-13614294
1 9329 GX234-05-0812081
0 9329 GX000-00-0000000
0 9329 GX008-50-3899336
0 9329 GX011-75-8470249
0 9329 GX020-42-13388867
0 9329 GX024-91-8520306
0 9329 GX026-88-6087429
0 9329 GX027-22-1703847
0 9329 GX034-11-2617393
0 9329 GX036-02-7994497
0 9329 GX046-08-13858054
0 9329 GX059-85-11403109
0 9329 GX099-37-0232298
0 9329 GX099-46-11473306
0 9329 GX108-04-9589788
0 9329 GX110-50-11723940
0 9329 GX124-11-4119164
0 9329 GX149-82-15204191
0 9329 GX165-95-6198495
0 9329 GX225-56-4184936
0 9329 GX229-57-4487470
0 9329 GX230-37-4125963
0 9329 GX231-40-14574318
0 9329 GX238-44-10302536
0 9329 GX239-85-8572461
0 9329 GX244-17-10154048
0 9329 GX245-16-4169590
0 9329 GX245-46-6341859
0 9329 GX246-91-8487173
0 9329 GX262-88-13259441
0 9329 GX263-41-4135561
0 9329 GX264-07-6385713
0 9329 GX264-38-12253757
0 9329 GX264-90-15990025
0 9329 GX265-89-6212449
0 9329 GX268-41-12034794
0 9329 GX268-83-5140660
0 9329 GX270-46-0293828
0 9329 GX270-64-11852140
0 9329 GX271-10-12458597
2 9326 GX272-03-6610348
1 9326 GX011-12-0595978
0 9326 GX000-00-0000000
0 9326 GX000-38-9492606
0 9326 GX000-84-4587136
0 9326 GX002-41-5566464
0 9326 GX002-51-2615036
0 9326 GX004-56-12238694
0 9326 GX004-72-2476906
0 9326 GX008-13-1835206
0 9326 GX008-64-7705528
0 9326 GX009-87-0976731
0 9326 GX012-24-7688369
0 9326 GX012-96-8727608
0 9326 GX023-87-16736657
0 9326 GX025-21-11820239
0 9326 GX025-22-15113698
0 9326 GX025-51-13959128
0 9326 GX025-57-11414648
0 9326 GX025-64-7587631
0 9326 GX027-62-4542881
0 9326 GX031-25-4759403
0 9326 GX036-10-7902858
0 9326 GX047-04-9457544
0 9326 GX047-06-4014803
0 9326 GX048-00-15113058
0 9326 GX048-02-12975919
0 9326 GX048-78-3273874
0 9326 GX235-35-0963257
0 9326 GX235-98-3789570
0 9326 GX236-51-15473637
0 9326 GX237-96-0892713
0 9326 GX239-35-7413891
0 9326 GX239-95-0176537
0 9326 GX251-34-10377030
0 9326 GX254-19-11374782
0 9326 GX260-63-10533444
0 9326 GX265-94-14886230
0 9326 GX269-78-1500497
0 9326 GX270-59-10270517
2 8946 GX046-79-6984659
2 8946 GX148-33-1869479
2 8946 GX252-36-12638222
1 8946 GX017-47-13290921
1 8946 GX030-69-3218092
1 8946 GX034-82-4550348
1 8946 GX044-01-9283107
1 8946 GX047-98-6660623
1 8946 GX057-96-12580825
1 8946 GX059-94-12068143
1 8946 GX060-13-13600036
1 8946 GX060-74-6594973
1 8946 GX093-08-1158999
0 8946 GX000-00-0000000
0 8946 GX000-42-15811803
0 8946 GX000-81-16418910
0 8946 GX008-38-10557859
0 8946 GX011-01-10891808
0 8946 GX013-71-5708874
0 8946 GX015-72-4458924
0 8946 GX023-91-9869060
0 8946 GX027-56-6376748
0 8946 GX037-11-10829529
0 8946 GX038-55-0681330
0 8946 GX043-86-4200105
0 8946 GX047-52-3712485
0 8946 GX053-77-4836617
0 8946 GX070-62-1070063
0 8946 GX105-53-13372327
0 8946 GX218-61-6263172
0 8946 GX223-72-13625320
0 8946 GX230-68-14727182
0 8946 GX235-34-7733230
0 8946 GX251-73-0159347
0 8946 GX254-47-1098586
0 8946 GX263-76-6934681
0 8946 GX263-84-8668756
0 8946 GX264-70-14223639
0 8946 GX269-12-5910753
0 8946 GX271-93-9895614
1 9747 GX006-77-1973537
1 9747 GX244-83-8716953
1 9747 GX269-92-7189826
0 9747 GX000-00-0000000
0 9747 GX001-51-8693413
0 9747 GX003-10-2820641
0 9747 GX003-74-0557776
0 9747 GX003-79-13695689
0 9747 GX009-57-0938999
0 9747 GX009-59-8595527
0 9747 GX009-80-10629348
0 9747 GX010-37-0206372
0 9747 GX013-46-2187318
0 9747 GX014-58-4004859
0 9747 GX015-79-5393654
0 9747 GX032-50-7316370
0 9747 GX049-33-2206612
0 9747 GX050-34-0439256
0 9747 GX062-76-0914936
0 9747 GX065-73-7392661
0 9747 GX148-27-15770966
0 9747 GX155-71-0504939
0 9747 GX229-75-14750078
0 9747 GX231-01-0640962
0 9747 GX236-45-15598812
0 9747 GX247-19-9516715
0 9747 GX247-34-4277646
0 9747 GX247-63-10766287
0 9747 GX248-23-15998266
0 9747 GX249-85-9742193
0 9747 GX250-31-7671617
0 9747 GX252-56-2141580
0 9747 GX253-15-3406713
0 9747 GX264-07-15838087
0 9747 GX264-43-6543997
0 9747 GX266-18-14688076
0 9747 GX267-50-2036010
0 9747 GX268-28-0548507
0 9747 GX269-49-14171555
0 9747 GX269-63-15607386
2 9740 GX005-94-14208849
2 9740 GX008-51-5639660
2 9740 GX012-37-2342061
2 9740 GX019-75-13916532
2 9740 GX074-76-16261807
2 9740 GX077-07-2951943
2 9740 GX229-28-11068981
2 9740 GX237-80-7497206
2 9740 GX257-53-10589749
2 9740 GX258-06-0611419
2 9740 GX268-55-9791226
1 9740 GX007-62-1126118
1 9740 GX015-78-0216468
1 9740 GX038-65-1678199
1 9740 GX041-25-14803324
1 9740 GX063-71-0401425
1 9740 GX077-08-15801730
1 9740 GX098-07-2885671
1 9740 GX135-28-6485892
1 9740 GX228-85-10518518
1 9740 GX231-93-11279468
1 9740 GX234-70-15061254
1 9740 GX236-31-11149347
1 9740 GX240-68-1184464
1 9740 GX248-03-7275316
1 9740 GX253-11-9846012
1 9740 GX255-05-10638500
1 9740 GX267-73-4450097
1 9740 GX269-19-0642640
0 9740 GX001-74-5132048
0 9740 GX001-88-2603815
0 9740 GX004-83-7935833
0 9740 GX007-01-16750210
0 9740 GX040-11-5249209
0 9740 GX042-38-2886005
0 9740 GX052-20-4359789
0 9740 GX067-74-3718011
0 9740 GX077-01-13481396
0 9740 GX242-92-8868913
0 9740 GX262-74-4596688
2 8835 GX010-99-5715419
2 8835 GX049-99-2518724
0 8835 GX000-00-0000000
0 8835 GX007-91-6779497
0 8835 GX008-14-0788708
0 8835 GX008-15-13942125
0 8835 GX011-58-14336551
0 8835 GX012-79-10684001
0 8835 GX013-00-10822427
0 8835 GX013-03-5962783
0 8835 GX015-54-0251701
0 8835 GX017-36-5859317
0 8835 GX017-60-0601078
0 8835 GX027-24-16202205
0 8835 GX030-11-15814183
0 8835 GX030-76-11969233
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录