提交 e7ff8e50 编写于 作者: Y yinhaofeng

add test dict

上级 3ac2d656
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import random
# Read Word Dict and Inverse Word Dict
def read_word_dict(filename):
word_dict = {}
for line in open(filename):
line = line.strip().split()
word_dict[int(line[1])] = line[0]
print('[%s]\n\tWord dict size: %d' % (filename, len(word_dict)))
return word_dict
# Read Embedding File
def read_embedding(filename):
embed = {}
for line in open(filename):
line = line.strip().split()
embed[int(line[0])] = list(map(float, line[1:]))
print('[%s]\n\tEmbedding size: %d' % (filename, len(embed)))
return embed
# Convert Embedding Dict 2 numpy array
def convert_embed_2_numpy(embed_dict, embed=None):
for k in embed_dict:
embed[k] = np.array(embed_dict[k])
print('Generate numpy embed:', embed.shape)
return embed
# Read Data
def read_data(filename):
data = {}
for line in open(filename):
line = line.strip().split()
data[line[0]] = list(map(int, line[2:]))
print('[%s]\n\tData size: %s' % (filename, len(data)))
return data
# Read Relation Data
def read_relation(filename):
data = []
for line in open(filename):
line = line.strip().split()
data.append((int(line[0]), line[1], line[2]))
print('[%s]\n\tInstance size: %s' % (filename, len(data)))
return data
Letor07Path = "./data"
word_dict = read_word_dict(filename=os.path.join(Letor07Path, 'word_dict.txt'))
query_data = read_data(filename=os.path.join(Letor07Path, 'qid_query.txt'))
doc_data = read_data(filename=os.path.join(Letor07Path, 'docid_doc.txt'))
embed_dict = read_embedding(filename=os.path.join(Letor07Path,
'embed_wiki-pdc_d50_norm'))
_PAD_ = len(word_dict) #193367
embed_dict[_PAD_] = np.zeros((50, ), dtype=np.float32)
word_dict[_PAD_] = '[PAD]'
W_init_embed = np.float32(np.random.uniform(-0.02, 0.02, [len(word_dict), 50]))
embedding = convert_embed_2_numpy(embed_dict, embed=W_init_embed)
np.save("embedding.npy", embedding)
batch_size = 64
data1_maxlen = 20
data2_maxlen = 500
embed_size = 50
train_iters = 2500
def make_train():
rel_set = {}
pair_list = []
rel = read_relation(filename=os.path.join(Letor07Path,
'relation.train.fold1.txt'))
for label, d1, d2 in rel:
if d1 not in rel_set:
rel_set[d1] = {}
if label not in rel_set[d1]:
rel_set[d1][label] = []
rel_set[d1][label].append(d2)
for d1 in rel_set:
label_list = sorted(rel_set[d1].keys(), reverse=True)
for hidx, high_label in enumerate(label_list[:-1]):
for low_label in label_list[hidx + 1:]:
for high_d2 in rel_set[d1][high_label]:
for low_d2 in rel_set[d1][low_label]:
pair_list.append((d1, high_d2, low_d2))
print('Pair Instance Count:', len(pair_list))
f = open("./data/train/train.txt", "w")
for batch in range(800):
X1 = np.zeros((batch_size * 2, data1_maxlen), dtype=np.int32)
X2 = np.zeros((batch_size * 2, data2_maxlen), dtype=np.int32)
X1[:] = _PAD_
X2[:] = _PAD_
for i in range(batch_size):
d1, d2p, d2n = random.choice(pair_list)
d1_len = min(data1_maxlen, len(query_data[d1]))
d2p_len = min(data2_maxlen, len(doc_data[d2p]))
d2n_len = min(data2_maxlen, len(doc_data[d2n]))
X1[i, :d1_len] = query_data[d1][:d1_len]
X2[i, :d2p_len] = doc_data[d2p][:d2p_len]
X1[i + batch_size, :d1_len] = query_data[d1][:d1_len]
X2[i + batch_size, :d2n_len] = doc_data[d2n][:d2n_len]
for i in range(batch_size * 2):
q = [str(x) for x in list(X1[i])]
d = [str(x) for x in list(X2[i])]
f.write(",".join(q) + "\t" + ",".join(d) + "\n")
f.close()
def make_test():
rel = read_relation(filename=os.path.join(Letor07Path,
'relation.test.fold1.txt'))
f = open("./data/test/test.txt", "w")
for label, d1, d2 in rel:
X1 = np.zeros(data1_maxlen, dtype=np.int32)
X2 = np.zeros(data2_maxlen, dtype=np.int32)
X1[:] = _PAD_
X2[:] = _PAD_
d1_len = min(data1_maxlen, len(query_data[d1]))
d2_len = min(data2_maxlen, len(doc_data[d2]))
X1[:d1_len] = query_data[d1][:d1_len]
X2[:d2_len] = doc_data[d2][:d2_len]
q = [str(x) for x in list(X1)]
d = [str(x) for x in list(X2)]
f.write(",".join(q) + "\t" + ",".join(d) + "\t" + str(label) + "\t" +
d1 + "\n")
f.close()
make_train()
make_test()
2 9639 GX099-60-3149248
1 9639 GX028-47-6554966
1 9639 GX031-84-2802741
1 9639 GX031-86-1702683
1 9639 GX031-89-11392170
1 9639 GX035-46-10142187
1 9639 GX039-07-1333080
1 9639 GX040-05-15096071
1 9639 GX045-35-10693225
1 9639 GX045-74-6226888
1 9639 GX046-31-8871083
1 9639 GX046-56-6274894
1 9639 GX050-09-14629105
1 9639 GX097-05-12714275
1 9639 GX101-06-7768196
1 9639 GX124-50-4934142
1 9639 GX259-01-13320140
1 9639 GX259-50-8109630
1 9639 GX259-72-16176934
1 9639 GX259-98-7821925
1 9639 GX260-27-13260880
1 9639 GX260-54-6363694
1 9639 GX260-78-6999656
1 9639 GX261-04-0843988
1 9639 GX261-23-4964814
0 9639 GX021-75-7026755
0 9639 GX021-80-16449591
0 9639 GX025-40-7135810
0 9639 GX031-89-9020252
0 9639 GX037-45-0533209
0 9639 GX038-17-11223353
0 9639 GX057-07-13335832
0 9639 GX081-50-12756687
0 9639 GX124-43-2364716
0 9639 GX129-60-0000000
0 9639 GX219-07-7475581
0 9639 GX233-90-7976935
0 9639 GX267-49-2983064
0 9639 GX267-74-2413254
0 9639 GX270-05-13614294
1 9329 GX234-05-0812081
0 9329 GX000-00-0000000
0 9329 GX008-50-3899336
0 9329 GX011-75-8470249
0 9329 GX020-42-13388867
0 9329 GX024-91-8520306
0 9329 GX026-88-6087429
0 9329 GX027-22-1703847
0 9329 GX034-11-2617393
0 9329 GX036-02-7994497
0 9329 GX046-08-13858054
0 9329 GX059-85-11403109
0 9329 GX099-37-0232298
0 9329 GX099-46-11473306
0 9329 GX108-04-9589788
0 9329 GX110-50-11723940
0 9329 GX124-11-4119164
0 9329 GX149-82-15204191
0 9329 GX165-95-6198495
0 9329 GX225-56-4184936
0 9329 GX229-57-4487470
0 9329 GX230-37-4125963
0 9329 GX231-40-14574318
0 9329 GX238-44-10302536
0 9329 GX239-85-8572461
0 9329 GX244-17-10154048
0 9329 GX245-16-4169590
0 9329 GX245-46-6341859
0 9329 GX246-91-8487173
0 9329 GX262-88-13259441
0 9329 GX263-41-4135561
0 9329 GX264-07-6385713
0 9329 GX264-38-12253757
0 9329 GX264-90-15990025
0 9329 GX265-89-6212449
0 9329 GX268-41-12034794
0 9329 GX268-83-5140660
0 9329 GX270-46-0293828
0 9329 GX270-64-11852140
0 9329 GX271-10-12458597
2 9326 GX272-03-6610348
1 9326 GX011-12-0595978
0 9326 GX000-00-0000000
0 9326 GX000-38-9492606
0 9326 GX000-84-4587136
0 9326 GX002-41-5566464
0 9326 GX002-51-2615036
0 9326 GX004-56-12238694
0 9326 GX004-72-2476906
0 9326 GX008-13-1835206
0 9326 GX008-64-7705528
0 9326 GX009-87-0976731
0 9326 GX012-24-7688369
0 9326 GX012-96-8727608
0 9326 GX023-87-16736657
0 9326 GX025-21-11820239
0 9326 GX025-22-15113698
0 9326 GX025-51-13959128
0 9326 GX025-57-11414648
0 9326 GX025-64-7587631
0 9326 GX027-62-4542881
0 9326 GX031-25-4759403
0 9326 GX036-10-7902858
0 9326 GX047-04-9457544
0 9326 GX047-06-4014803
0 9326 GX048-00-15113058
0 9326 GX048-02-12975919
0 9326 GX048-78-3273874
0 9326 GX235-35-0963257
0 9326 GX235-98-3789570
0 9326 GX236-51-15473637
0 9326 GX237-96-0892713
0 9326 GX239-35-7413891
0 9326 GX239-95-0176537
0 9326 GX251-34-10377030
0 9326 GX254-19-11374782
0 9326 GX260-63-10533444
0 9326 GX265-94-14886230
0 9326 GX269-78-1500497
0 9326 GX270-59-10270517
2 8946 GX046-79-6984659
2 8946 GX148-33-1869479
2 8946 GX252-36-12638222
1 8946 GX017-47-13290921
1 8946 GX030-69-3218092
1 8946 GX034-82-4550348
1 8946 GX044-01-9283107
1 8946 GX047-98-6660623
1 8946 GX057-96-12580825
1 8946 GX059-94-12068143
1 8946 GX060-13-13600036
1 8946 GX060-74-6594973
1 8946 GX093-08-1158999
0 8946 GX000-00-0000000
0 8946 GX000-42-15811803
0 8946 GX000-81-16418910
0 8946 GX008-38-10557859
0 8946 GX011-01-10891808
0 8946 GX013-71-5708874
0 8946 GX015-72-4458924
0 8946 GX023-91-9869060
0 8946 GX027-56-6376748
0 8946 GX037-11-10829529
0 8946 GX038-55-0681330
0 8946 GX043-86-4200105
0 8946 GX047-52-3712485
0 8946 GX053-77-4836617
0 8946 GX070-62-1070063
0 8946 GX105-53-13372327
0 8946 GX218-61-6263172
0 8946 GX223-72-13625320
0 8946 GX230-68-14727182
0 8946 GX235-34-7733230
0 8946 GX251-73-0159347
0 8946 GX254-47-1098586
0 8946 GX263-76-6934681
0 8946 GX263-84-8668756
0 8946 GX264-70-14223639
0 8946 GX269-12-5910753
0 8946 GX271-93-9895614
1 9747 GX006-77-1973537
1 9747 GX244-83-8716953
1 9747 GX269-92-7189826
0 9747 GX000-00-0000000
0 9747 GX001-51-8693413
0 9747 GX003-10-2820641
0 9747 GX003-74-0557776
0 9747 GX003-79-13695689
0 9747 GX009-57-0938999
0 9747 GX009-59-8595527
0 9747 GX009-80-10629348
0 9747 GX010-37-0206372
0 9747 GX013-46-2187318
0 9747 GX014-58-4004859
0 9747 GX015-79-5393654
0 9747 GX032-50-7316370
0 9747 GX049-33-2206612
0 9747 GX050-34-0439256
0 9747 GX062-76-0914936
0 9747 GX065-73-7392661
0 9747 GX148-27-15770966
0 9747 GX155-71-0504939
0 9747 GX229-75-14750078
0 9747 GX231-01-0640962
0 9747 GX236-45-15598812
0 9747 GX247-19-9516715
0 9747 GX247-34-4277646
0 9747 GX247-63-10766287
0 9747 GX248-23-15998266
0 9747 GX249-85-9742193
0 9747 GX250-31-7671617
0 9747 GX252-56-2141580
0 9747 GX253-15-3406713
0 9747 GX264-07-15838087
0 9747 GX264-43-6543997
0 9747 GX266-18-14688076
0 9747 GX267-50-2036010
0 9747 GX268-28-0548507
0 9747 GX269-49-14171555
0 9747 GX269-63-15607386
2 9740 GX005-94-14208849
2 9740 GX008-51-5639660
2 9740 GX012-37-2342061
2 9740 GX019-75-13916532
2 9740 GX074-76-16261807
2 9740 GX077-07-2951943
2 9740 GX229-28-11068981
2 9740 GX237-80-7497206
2 9740 GX257-53-10589749
2 9740 GX258-06-0611419
2 9740 GX268-55-9791226
1 9740 GX007-62-1126118
1 9740 GX015-78-0216468
1 9740 GX038-65-1678199
1 9740 GX041-25-14803324
1 9740 GX063-71-0401425
1 9740 GX077-08-15801730
1 9740 GX098-07-2885671
1 9740 GX135-28-6485892
1 9740 GX228-85-10518518
1 9740 GX231-93-11279468
1 9740 GX234-70-15061254
1 9740 GX236-31-11149347
1 9740 GX240-68-1184464
1 9740 GX248-03-7275316
1 9740 GX253-11-9846012
1 9740 GX255-05-10638500
1 9740 GX267-73-4450097
1 9740 GX269-19-0642640
0 9740 GX001-74-5132048
0 9740 GX001-88-2603815
0 9740 GX004-83-7935833
0 9740 GX007-01-16750210
0 9740 GX040-11-5249209
0 9740 GX042-38-2886005
0 9740 GX052-20-4359789
0 9740 GX067-74-3718011
0 9740 GX077-01-13481396
0 9740 GX242-92-8868913
0 9740 GX262-74-4596688
2 8835 GX010-99-5715419
2 8835 GX049-99-2518724
0 8835 GX000-00-0000000
0 8835 GX007-91-6779497
0 8835 GX008-14-0788708
0 8835 GX008-15-13942125
0 8835 GX011-58-14336551
0 8835 GX012-79-10684001
0 8835 GX013-00-10822427
0 8835 GX013-03-5962783
0 8835 GX015-54-0251701
0 8835 GX017-36-5859317
0 8835 GX017-60-0601078
0 8835 GX027-24-16202205
0 8835 GX030-11-15814183
0 8835 GX030-76-11969233
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册