Unverified commit 2caf374d authored by wuzhihua, committed by GitHub

Merge pull request #184 from yinhaofeng/multiview-simnet

add model multiview-simnet
@@ -18,12 +18,12 @@ workspace: "models/match/multiview-simnet"
 # list of dataset
 dataset:
 - name: dataset_train # name of dataset to distinguish different datasets
-  batch_size: 2
+  batch_size: 128
   type: DataLoader # or QueueDataset
   data_path: "{workspace}/data/train"
-  sparse_slots: "1 2 3"
+  sparse_slots: "0 1 2"
 - name: dataset_infer # name
-  batch_size: 2
+  batch_size: 1
   type: DataLoader # or QueueDataset
   data_path: "{workspace}/data/test"
   sparse_slots: "1 2"
@@ -34,17 +34,17 @@ hyper_parameters:
     class: Adam
     learning_rate: 0.0001
     strategy: async
-  query_encoder: "bow"
-  title_encoder: "bow"
+  query_encoder: "gru"
+  title_encoder: "gru"
   query_encode_dim: 128
   title_encode_dim: 128
-  sparse_feature_dim: 1000001
+  sparse_feature_dim: 1439
   embedding_dim: 128
   hidden_size: 128
   margin: 0.1
 # select runner by name
-mode: train_runner
+mode: [train_runner,infer_runner]
 # config of each runner.
 # runner is a kind of paddle training class, which wraps the train/infer process.
 runner:
@@ -62,12 +62,14 @@ runner:
   save_inference_fetch_varnames: [] # fetch vars of save inference
   init_model_path: "" # load model path
   print_interval: 1
+  phases: phase1
 - name: infer_runner
   class: infer
   # device to run training or infer
   device: cpu
   print_interval: 1
-  init_model_path: "increment/0" # load model path
+  init_model_path: "increment/1" # load model path
+  phases: phase2
 # runner will run all the phase in each epoch
 phase:
@@ -75,7 +77,7 @@ phase:
   model: "{workspace}/model.py" # user-defined model
   dataset_name: dataset_train # select dataset by name
   thread_num: 1
-#- name: phase2
-#  model: "{workspace}/model.py" # user-defined model
-#  dataset_name: dataset_infer # select dataset by name
-#  thread_num: 1
+- name: phase2
+  model: "{workspace}/model.py" # user-defined model
+  dataset_name: dataset_infer # select dataset by name
+  thread_num: 1
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#encoding=utf-8
import os
import sys
import numpy as np
import random
f = open("./zhidao", "r")
lines = f.readlines()
f.close()
# build the word -> id dictionary
word_dict = {}
for line in lines:
line = line.strip().split("\t")
text = line[0].split(" ") + line[1].split(" ")
for word in text:
if word in word_dict:
continue
else:
word_dict[word] = len(word_dict) + 1
f = open("./zhidao", "r")
lines = f.readlines()
f.close()
lines = [line.strip().split("\t") for line in lines]
# build a dict mapping each query to its negative examples
neg_dict = {}
for line in lines:
if line[2] == "0":
if line[0] in neg_dict:
neg_dict[line[0]].append(line[1])
else:
neg_dict[line[0]] = [line[1]]
# build a dict mapping each query to its positive examples
pos_dict = {}
for line in lines:
if line[2] == "1":
if line[0] in pos_dict:
pos_dict[line[0]].append(line[1])
else:
pos_dict[line[0]] = [line[1]]
# split the queries into a training set (first 90) and a test set
query_list = list(pos_dict.keys())
#print(len(query_list))
random.shuffle(query_list)
train_query = query_list[:90]
test_query = query_list[90:]
# build the training set of (query, pos, neg) triples
train_set = []
for query in train_query:
for pos in pos_dict[query]:
if query not in neg_dict:
continue
for neg in neg_dict[query]:
train_set.append([query, pos, neg])
random.shuffle(train_set)
# build the test set of (query, title, label) pairs
test_set = []
for query in test_query:
for pos in pos_dict[query]:
test_set.append([query, pos, 1])
if query not in neg_dict:
continue
for neg in neg_dict[query]:
test_set.append([query, neg, 0])
random.shuffle(test_set)
# convert train-set query/pos/neg into slot:id format (slots 0, 1, 2)
f = open("train.txt", "w")
for line in train_set:
query = line[0].strip().split(" ")
pos = line[1].strip().split(" ")
neg = line[2].strip().split(" ")
query_list = []
for word in query:
query_list.append(word_dict[word])
pos_list = []
for word in pos:
pos_list.append(word_dict[word])
neg_list = []
for word in neg:
neg_list.append(word_dict[word])
f.write(' '.join(["0:" + str(x) for x in query_list]) + " " + ' '.join([
"1:" + str(x) for x in pos_list
]) + " " + ' '.join(["2:" + str(x) for x in neg_list]) + "\n")
f.close()
# convert test-set query and pos into slot:id format; also dump labels and queries
f = open("test.txt", "w")
fa = open("label.txt", "w")
fb = open("testquery.txt", "w")
for line in test_set:
query = line[0].strip().split(" ")
pos = line[1].strip().split(" ")
label = line[2]
query_list = []
for word in query:
query_list.append(word_dict[word])
pos_list = []
for word in pos:
pos_list.append(word_dict[word])
f.write(' '.join(["0:" + str(x) for x in query_list]) + " " + ' '.join(
["1:" + str(x) for x in pos_list]) + "\n")
fa.write(str(label) + "\n")
fb.write(','.join([str(x) for x in query_list]) + "\n")
f.close()
fa.close()
fb.close()
-224289:0 126379:0 284519:0 549329:0 750666:0 393772:0 586898:0 736887:0 48785:0 906517:0 229162:1 483485:1 739835:1 29957:1 694497:1 997508:1 556876:1 717791:1 232176:1 430356:1
-366182:0 82062:0 708883:0 949128:0 798964:0 639103:0 409033:0 79301:0 405607:0 342616:0 61552:1 560547:1 3760:1 754734:1 98496:1 472427:1 979596:1 750283:1 492028:1 801383:1
-969571:0 405187:0 756217:0 563640:0 572168:0 881952:0 446260:0 692177:0 994140:0 485393:0 509081:1 297377:1 465399:1 934708:1 430949:1 135651:1 484531:1 385306:1 463957:1 996004:1
-436320:0 423131:0 963969:0 78345:0 879550:0 458203:0 684397:0 956202:0 989802:0 526101:0 852446:1 182545:1 625656:1 674856:1 422648:1 74100:1 48372:1 850830:1 336087:1 178251:1
-242683:0 118677:0 20731:0 970617:0 355890:0 739613:0 926695:0 963639:0 201043:0 611907:0 115309:1 310984:1 615584:1 638886:1 575934:1 889389:1 974807:1 570987:1 532482:1 911925:1
-954007:0 122623:0 168195:0 348901:0 217880:0 84759:0 925763:0 436382:0 573742:0 942921:0 553377:1 835046:1 137907:1 933870:1 766585:1 48483:1 543079:1 889467:1 521705:1 906676:1
-798690:0 617323:0 553266:0 232924:0 159461:0 404822:0 52992:0 364854:0 913876:0 547974:0 559472:1 748595:1 71793:1 357331:1 606888:1 477051:1 291481:1 89363:1 503881:1 423029:1
-228207:0 785250:0 661149:0 803304:0 478781:0 495202:0 804509:0 273065:0 26123:0 810840:0 801871:1 146772:1 421009:1 752344:1 946358:1 531668:1 5771:1 191294:1 627329:1 434664:1
-984628:0 762075:0 505288:0 48519:0 72492:0 26568:0 684085:0 613095:0 781547:0 895829:0 280541:1 903234:1 708065:1 386658:1 331060:1 3693:1 279760:1 459579:1 423552:1 962594:1
-674172:0 39271:0 646093:0 757969:0 553251:0 734960:0 967186:0 856940:0 617246:0 376452:0 113050:1 472707:1 975057:1 865095:1 155824:1 389921:1 205520:1 513667:1 163588:1 953463:1
+0:908 0:159 0:909 0:910 0:109 1:911 1:159 1:909 1:910 1:109
+0:210 0:10 0:211 0:14 0:212 1:211 1:210 1:32 1:148 1:212 1:48 1:65 1:65 1:211 1:210 1:33 1:213 1:214 1:48
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:79 1:80 1:81 1:13 1:78 1:1 1:692 1:144 1:85 1:48
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:75 1:83 1:78 1:86 1:270 1:85 1:48
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:78 1:79 1:80 1:235 1:144 1:236 1:169 1:237 1:138 1:48 1:22
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:227 1:228 1:13 1:75 1:229 1:80 1:81 1:4 1:78 1:14 1:39
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:113 1:68 1:21 1:22
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:176 1:113 1:68 1:1357
+0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:1105 1:732 1:729 1:509 1:510 1:75 1:68 1:730 1:16 1:731 1:22
+0:155 0:837 0:838 0:839 1:155 1:838 1:1296
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:113 1:68 1:21
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:83 1:100 1:79 1:81 1:4 1:86 1:82 1:94 1:84 1:85 1:48 1:22
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:113 1:68 1:114 1:21 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:167 1:168 1:13 1:80 1:81 1:144 1:82 1:169 1:170 1:171 1:172 1:148 1:173 1:174
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:449 1:450 1:80 1:10 1:451 1:13 1:452 1:453 1:6 1:85 1:168 1:81 1:4 1:78 1:22 1:22
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:230 1:113 1:68 1:114 1:13 1:144 1:113 1:68 1:114
0:155 0:837 0:838 0:839 1:1371 1:155 1:578 1:838 1:21 1:839 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:80 1:81 1:86 1:172 1:83 1:170 1:138 1:48
0:91 0:421 0:104 0:695 0:96 0:696 0:697 1:421 1:104 1:698 1:67 1:96 1:696
0:222 0:223 0:224 0:225 0:67 0:96 1:624 1:1238 1:222 1:223 1:224 1:67 1:96
0:210 0:10 0:211 0:14 0:212 1:211 1:614 1:214 1:86 1:82 1:48 1:65 1:65 1:155 1:212
0:91 0:421 0:104 0:695 0:96 0:696 0:697 1:421 1:104 1:1406 1:1407
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:1222 1:116 1:113 1:68 1:22
0:91 0:421 0:104 0:695 0:96 0:696 0:697 1:421 1:104 1:695 1:96 1:696 1:1128
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:233 1:1350 1:4 1:1074 1:113 1:68 1:21 1:70 1:22
0:222 0:223 0:224 0:225 0:67 0:96 1:222 1:223 1:224 1:419 1:96 1:1054 1:1055
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:167 1:79 1:80 1:81 1:86 1:82 1:83 1:84 1:138 1:48 1:22
0:222 0:223 0:224 0:225 0:67 0:96 1:222 1:223 1:224 1:67 1:96
0:222 0:223 0:224 0:225 0:67 0:96 1:222 1:226 1:223 1:224 1:67 1:96
0:210 0:10 0:211 0:14 0:212 1:210 1:211 1:32 1:4 1:474 1:637
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:1 1:1211 1:178 1:78 1:13 1:79 1:80 1:81 1:14 1:85 1:22
0:421 0:456 0:153 0:152 0:159 0:457 1:421 1:920 1:456 1:153 1:152 1:14 1:921
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:144 1:113 1:68 1:115
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:1106 1:78 1:1107 1:13 1:170 1:1108 1:13 1:1109 1:75 1:79 1:80 1:81 1:13 1:177 1:85 1:577 1:78 1:32 1:170 1:86 1:82 1:48 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:135 1:78 1:91 1:79 1:78 1:136 1:81 1:4 1:137 1:86 1:82 1:83 1:84 1:138 1:48
0:421 0:456 0:153 0:152 0:159 0:457 1:153 1:421 1:456 1:152 1:475 1:68 1:476
0:155 0:837 0:838 0:839 1:155 1:838 1:839
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:576 1:168 1:80 1:81 1:13 1:86 1:80 1:83 1:170 1:48 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:1070 1:78 1:33 1:67 1:79 1:121 1:80 1:81 1:276 1:162 1:1071 1:1072 1:103 1:13 1:167 1:1073 1:164 1:86 1:8 1:83 1:170 1:6 1:138 1:48 1:22
0:222 0:223 0:224 0:225 0:67 0:96 1:421 1:936 1:223 1:4 1:937 1:224 1:67 1:96 1:22
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:110 1:144 1:113 1:68 1:1155 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:78 1:80 1:81 1:13 1:86 1:82 1:1280 1:4 1:170 1:138 1:48
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:80 1:81 1:4 1:144 1:8 1:169 1:84 1:171 1:172 1:48
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:170 1:65 1:65 1:168 1:138 1:80 1:1212 1:81 1:65 1:65 1:13 1:65 1:65 1:452 1:172 1:538 1:6 1:80 1:173
0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:1105 1:732 1:729 1:509 1:510 1:75 1:68 1:730 1:13 1:75 1:68 1:734 1:48 1:22 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:1070 1:78 1:618 1:81 1:14 1:39 1:86 1:82 1:83 1:170 1:138 1:48
0:1026 0:1027 0:1028 0:1029 0:1030 0:1031 0:75 0:480 1:1027 1:75 1:480 1:1029 1:4 1:1031 1:65 1:65 1:1032 1:1033 1:1034 1:1029 1:1031 1:1250
0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:747 1:748 1:729 1:75 1:68 1:730 1:16 1:734
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:80 1:81 1:65 1:65 1:87 1:82 1:83 1:84 1:80
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:86 1:1039 1:85 1:168 1:81 1:4 1:78 1:48 1:22
0:1026 0:1027 0:1028 0:1029 0:1030 0:1031 0:75 0:480 1:1032 1:1033 1:4 1:1034 1:1031 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:94 1:84 1:79 1:85 1:617 1:4 1:78 1:13 1:87 1:618 1:81
0:908 0:159 0:909 0:910 0:109 1:911 1:14 1:922 1:910 1:109 1:877
0:1335 0:409 0:1336 0:10 1:1335 1:409 1:1336 1:10
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:110 1:4 1:111 1:112 1:113 1:68 1:1074
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:1137 1:100 1:895 1:80 1:81 1:13 1:86 1:82 1:83 1:84 1:6 1:138 1:48 1:22
0:908 0:159 0:909 0:910 0:109 1:908 1:14 1:1311 1:910 1:109 1:877
0:421 0:456 0:153 0:152 0:159 0:457 1:421 1:153 1:456 1:152 1:14 1:457
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:233 1:234 1:4 1:111 1:112 1:113 1:68 1:114 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:1186 1:78 1:13 1:79 1:81 1:79 1:1187 1:86 1:82 1:83 1:84 1:6 1:80 1:48 1:22
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:13 1:113 1:68 1:115 1:769 1:548 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:168 1:80 1:81 1:65 1:65 1:86 1:8 1:83 1:84 1:80 1:48
0:210 0:10 0:211 0:14 0:212 1:211 1:427 1:32 1:614 1:212 1:14 1:39
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:110 1:4 1:111 1:112 1:113 1:68 1:114
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:86 1:113 1:480 1:1283 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:958 1:170 1:450 1:121 1:80 1:10 1:1428 1:13 1:1429 1:85 1:79 1:81 1:4 1:78 1:13 1:33 1:1251 1:4 1:160 1:137
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:28 1:170 1:439 1:1165 1:1166 1:13 1:133 1:85 1:94 1:168 1:80 1:81 1:4 1:78 1:48 1:22 1:22
0:222 0:223 0:224 0:225 0:67 0:96 1:421 1:422 1:223 1:224 1:67 1:96
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:112 1:113 1:68 1:22 1:148 1:112 1:113 1:68 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:587 1:170 1:80 1:10 1:774 1:10 1:13 1:57 1:51 1:86 1:85 1:94 1:168 1:81 1:4 1:78 1:22
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:951 1:442 1:4 1:111 1:13 1:112 1:113 1:480 1:114 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:772 1:84 1:144 1:85 1:168 1:80 1:81 1:4 1:78 1:48 1:22
0:210 0:10 0:211 0:14 0:212 1:210 1:10 1:211 1:14 1:212
0:222 0:223 0:224 0:225 0:67 0:96 1:222 1:1378 1:223 1:224 1:67 1:96
0:155 0:837 0:838 0:839 1:49 1:14 1:838 1:839
0:210 0:10 0:211 0:14 0:212 1:148 1:472 1:473 1:211 1:13 1:210 1:32 1:155 1:474
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:28 1:78 1:80 1:81 1:137 1:1112 1:84 1:450 1:1113 1:81 1:137 1:86 1:85 1:81 1:4 1:78 1:48 1:22
0:908 0:159 0:909 0:910 0:109 1:911 1:912 1:909 1:910 1:109
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:807 1:78 1:169 1:81 1:94 1:170 1:144 1:80 1:48 1:22
0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:732 1:729 1:75 1:68 1:730 1:16 1:734 1:22
0:91 0:421 0:104 0:695 0:96 0:696 0:697 1:104 1:421 1:86 1:695 1:96 1:696 1:9
0:155 0:837 0:838 0:839 1:1052 1:205 1:155 1:838 1:839 1:70
0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:732 1:729 1:509 1:510 1:75 1:68 1:730 1:16 1:734 1:22
0:210 0:10 0:211 0:14 0:212 1:211 1:65 1:65 1:14 1:212 1:65 1:65 1:14 1:1349
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:882 1:113 1:68 1:21
0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:1079 1:732 1:729 1:75 1:68 1:730 1:16 1:734 1:22
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:958 1:84 1:959 1:80 1:577 1:14 1:39 1:13 1:79 1:78 1:80 1:81 1:86 1:82 1:169 1:84 1:960 1:48
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:1074 1:113 1:68
0:210 0:10 0:211 0:14 0:212 1:211 1:210 1:10 1:14 1:212 1:211
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:116 1:113 1:68 1:800 1:173
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:168 1:80 1:81 1:4 1:78 1:13 1:423 1:424 1:235 1:4 1:84 1:138 1:48
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:78 1:121 1:80 1:81 1:86 1:82 1:83 1:170 1:138 1:48 1:22
0:421 0:456 0:153 0:152 0:159 0:457 1:222 1:39 1:456 1:153 1:152 1:475 1:495 1:737 1:1076 1:102 1:1077 1:1078
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:112 1:113 1:68 1:114
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:113 1:68 1:115 1:116 1:22
0:91 0:421 0:104 0:695 0:96 0:696 0:697 1:421 1:104 1:86 1:695 1:96 1:696 1:9 1:65 1:65 1:104 1:86 1:695 1:96 1:696 1:1128
0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:113 1:68 1:114 1:86 1:75 1:110
0:421 0:456 0:153 0:152 0:159 0:457 1:421 1:1227 1:456 1:152 1:14 1:457
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:80 1:81 1:86 1:82 1:118 1:170 1:138 1:48
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:78 1:80 1:81 1:86 1:8 1:1087 1:84 1:80 1:48
0:1026 0:1027 0:1028 0:1029 0:1030 0:1031 0:75 0:480 1:1391 1:1392 1:13 1:1393 1:1032 1:1033 1:189 1:4 1:629 1:1034 1:1031 1:48
0:908 0:159 0:909 0:910 0:109 1:908 1:30 1:155 1:922 1:910 1:109 1:877 1:22
0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:729 1:732 1:733 1:10 1:120 1:75 1:68 1:730 1:16 1:734
0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:94 1:78 1:80 1:81 1:65 1:65 1:58 1:94 1:84 1:85 1:206 1:14 1:85 1:22
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -40,8 +39,8 @@ class Reader(ReaderBase):
         padding = 0
         output = [(slot, []) for slot in self.all_slots]
         for elem in elements:
-            feasign, slot = elem.split(':')
-            if not self._all_slots_dict.has_key(slot):
+            slot, feasign = elem.split(':')
+            if slot not in self._all_slots_dict:
                 continue
             self._all_slots_dict[slot][0] = True
             index = self._all_slots_dict[slot][1]
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
class Dataset:
def __init__(self):
pass
class SyntheticDataset(Dataset):
def __init__(self,
sparse_feature_dim,
query_slot_num,
title_slot_num,
dataset_size=10000):
# ids are randomly generated
self.ids_per_slot = 10
self.sparse_feature_dim = sparse_feature_dim
self.query_slot_num = query_slot_num
self.title_slot_num = title_slot_num
self.dataset_size = dataset_size
def _reader_creator(self, is_train):
def generate_ids(num, space):
return [random.randint(0, space - 1) for i in range(num)]
def reader():
for i in range(self.dataset_size):
query_slots = []
pos_title_slots = []
neg_title_slots = []
for i in range(self.query_slot_num):
qslot = generate_ids(self.ids_per_slot,
self.sparse_feature_dim)
qslot = [str(fea) + ':' + str(i) for fea in qslot]
query_slots += qslot
for i in range(self.title_slot_num):
pt_slot = generate_ids(self.ids_per_slot,
self.sparse_feature_dim)
pt_slot = [
str(fea) + ':' + str(i + self.query_slot_num)
for fea in pt_slot
]
pos_title_slots += pt_slot
if is_train:
for i in range(self.title_slot_num):
nt_slot = generate_ids(self.ids_per_slot,
self.sparse_feature_dim)
nt_slot = [
str(fea) + ':' +
str(i + self.query_slot_num + self.title_slot_num)
for fea in nt_slot
]
neg_title_slots += nt_slot
yield query_slots + pos_title_slots + neg_title_slots
else:
yield query_slots + pos_title_slots
return reader
def train(self):
return self._reader_creator(True)
def valid(self):
return self._reader_creator(True)
def test(self):
return self._reader_creator(False)
if __name__ == '__main__':
sparse_feature_dim = 1000001
query_slots = 1
title_slots = 1
dataset_size = 10
dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots,
dataset_size)
train_reader = dataset.train()
test_reader = dataset.test()
with open("data/train/train.txt", 'w') as fout:
for data in train_reader():
fout.write(' '.join(data))
fout.write("\n")
with open("data/test/test.txt", 'w') as fout:
for data in test_reader():
fout.write(' '.join(data))
fout.write("\n")
@@ -43,8 +43,8 @@ class Reader(ReaderBase):
         padding = 0
         output = [(slot, []) for slot in self.all_slots]
         for elem in elements:
-            feasign, slot = elem.split(':')
-            if not self._all_slots_dict.has_key(slot):
+            slot, feasign = elem.split(':')
+            if slot not in self._all_slots_dict:
                 continue
             self._all_slots_dict[slot][0] = True
             index = self._all_slots_dict[slot][1]
...
# multiview-simnet text matching model

The following is a brief directory structure and description for this example:

```
├── data # sample data
    ├── train
        ├── train.txt # sample training data
    ├── test
        ├── test.txt # sample test data
    ├── preprocess.py # data preprocessing script
├── __init__.py
├── README.md # documentation
├── model.py # model definition
├── config.yaml # configuration file
├── run.sh # run script, used when reproducing the results
├── transform.py # reformats inference output for metric computation
├── reader.py # for reference when you need a custom dataset
├── evaluate_reader.py # for reference when you need a custom dataset
```

Note: before reading this example, we recommend that you first go through the following:

[PaddleRec getting-started tutorial](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
## Contents

- [Model overview](#model-overview)
- [Data preparation](#data-preparation)
- [Environment](#environment)
- [Quick start](#quick-start)
- [Reproducing results](#reproducing-results)
- [Advanced usage](#advanced-usage)
- [FAQ](#faq)
## Model overview

In personalized recommendation scenarios, the list of items presented to a user is usually produced by a personalized matching model. In practice, a user has features from many views, such as the user ID, age, and item click history; an item, for example a news article, likewise has multi-view features such as its title and category. The multi-view SimNet model fuses the multi-view features of both users and items into a single model and learns personalized matching end to end. It consists of several encoder modules, one per feature view. The project currently provides a Bag-of-Embedding encoder, a Temporal-Convolutional encoder, and a Gated-Recurrent-Unit encoder, and more encoders that work well with sparse features will be added over time. Training currently uses pairwise ranking: for each related user-item pair, a randomly sampled item is used as the negative example for ranking learning.
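To make the pairwise ranking objective concrete, here is a minimal sketch of a margin-based hinge loss of the kind used for this training mode (the margin corresponds to margin: 0.1 in config.yaml). The cosine-similarity helper, numpy usage, and shapes are illustrative assumptions, not the actual model.py code:

```
import numpy as np

def cosine_sim(a, b):
    # cosine similarity between corresponding rows of a and b
    return np.sum(a * b, axis=1) / (
        np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1) + 1e-8)

def pairwise_hinge_loss(query_vec, pos_vec, neg_vec, margin=0.1):
    # push sim(query, pos) above sim(query, neg) by at least `margin`
    pos_sim = cosine_sim(query_vec, pos_vec)
    neg_sim = cosine_sim(query_vec, neg_vec)
    return np.mean(np.maximum(0.0, neg_sim + margin - pos_sim))

# toy usage: a batch of 4 query/pos/neg encodings of dimension 128
np.random.seed(0)
q, p, n = [np.random.normal(size=(4, 128)) for _ in range(3)]
print(pairwise_hinge_loss(q, p, n))
```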
For details of the model, please refer to the paper [MultiView-Simnet](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/frp1159-songA.pdf):
<p align="center">
<img align="center" src="../../../doc/imgs/multiview-simnet.png">
<p>
## Data preparation

We have released our self-built benchmark, which comprises four datasets: Baidu Zhidao, ECOM, QQSIM, and UNICOM. Here we train on the Baidu Zhidao dataset. Run the following commands to download it:
```
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
rm simnet_dataset-1.0.0.tar.gz
```
In the data format, each word is marked as slot:token, where the slot identifies which sentence the word belongs to (0 = query, 1 = positive title, 2 = negative title) and the token is the word ID:
```
0:358 0:206 0:205 0:250 0:9 0:3 0:207 0:10 0:330 0:164 1:1144 1:217 1:206 1:9 1:3 1:207 1:10 1:398 1:2 2:217 2:206 2:9 2:3 2:207 2:10 2:398 2:2
0:358 0:206 0:205 0:250 0:9 0:3 0:207 0:10 0:330 0:164 1:951 1:952 1:206 1:9 1:3 1:207 1:10 1:398 2:217 2:206 2:9 2:3 2:207 2:10 2:398 2:2
```
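As a quick illustration of how such a line can be consumed (a simplified sketch, not the repository's reader.py), the slot:token pairs can be grouped back into per-slot token lists:

```
def parse_slots(line):
    # group "slot:token" pairs into {slot: [token ids]}, preserving order
    slots = {}
    for elem in line.split():
        slot, token = elem.split(':')
        slots.setdefault(slot, []).append(int(token))
    return slots

sample = "0:358 0:206 1:1144 1:217 2:217 2:206"
print(parse_slots(sample))  # {'0': [358, 206], '1': [1144, 217], '2': [217, 206]}
```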
## Environment

- PaddlePaddle >= 1.7.2
- Python 2.7
- PaddleRec >= 0.1
- OS: Linux
## Quick start

Sample data is provided so you can try the model quickly. Start training by running the following command directly under the paddlerec directory:
```
python -m paddlerec.run -m models/match/multiview-simnet/config.yaml
```
## Reproducing results

To make it easy to run every model end to end, sample data is provided under each model. To reproduce the results in this readme, follow the steps below in order.

1. Make sure your current directory is PaddleRec/models/match/multiview-simnet
2. Download and extract the dataset under the data directory with the following commands:
```
cd data
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
tar xzf simnet_dataset-1.0.0.tar.gz
rm -f simnet_dataset-1.0.0.tar.gz
mv data/zhidao ./
rm -rf data
```
3. We provide a script that quickly converts the raw Chinese text in the dataset into the trainable format. After extracting the dataset, you will see a file named zhidao in the directory; run the provided preprocess.py under Python 3 to generate test.txt, train.txt, label.txt, and testquery.txt, then move train.txt and test.txt into the train and test directories so they can be used for training:
```
python3 preprocess.py
rm -f ./train/train.txt
mv train.txt ./train
rm -f ./test/test.txt
mv test.txt ./test
cd ..
```
4. Go back to the multiview-simnet directory, open config.yaml, and update its parameters:
   change workspace to your current absolute path (you can get it with the pwd command).
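For example (the path below is hypothetical; substitute your own pwd output):

```
workspace: "/home/your_name/PaddleRec/models/match/multiview-simnet"
```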
5. Run the script to start training. The script launches training with python -m paddlerec.run -m ./config.yaml and writes the output to a result file, then runs the transform.py formatting step, and finally computes the positive-negative ratio:
```
sh run.sh
```
The output looks roughly as follows:
```
................run.................
!!! The CPU_NUM is not specified, you should set CPU_NUM in the environment variable list.
CPU_NUM indicates that how many CPUPlace are used in the current task.
And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.
export CPU_NUM=32 # for example, set CPU_NUM as number of physical CPU core which is 32.
!!! The default number of CPU_NUM=1.
I0821 14:24:57.255358 7888 parallel_executor.cc:440] The Program will be executed on CPU using ParallelExecutor, 1 cards are used, so 1 programs are executed in parallel.
I0821 14:24:57.259166 7888 build_strategy.cc:365] SeqOnlyAllReduceOps:0, num_trainers:1
I0821 14:24:57.262634 7888 parallel_executor.cc:307] Inplace strategy is enabled, when build_strategy.enable_inplace = True
I0821 14:24:57.264791 7888 parallel_executor.cc:375] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
103
pnr: 1.17674418605
query_num: 11
pair_num: 468 468
equal_num: 0
正序率: 0.540598290598
253 215
```
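For reference, the pnr (positive-negative ratio) reported above can be understood with the following simplified sketch: for each query, count the (relevant, irrelevant) title pairs whose predicted similarities are ordered correctly versus inversely. Ties (the equal_num line in the log) are ignored here, so this is an approximation of what tools/cal_pos_neg.py computes, not the script itself:

```
from collections import defaultdict

def pnr(rows):
    # rows: (query, sim, label) triples, label 1 = relevant, 0 = irrelevant
    by_query = defaultdict(list)
    for query, sim, label in rows:
        by_query[query].append((sim, label))
    pos_pairs = neg_pairs = 0
    for items in by_query.values():
        for sim_i, label_i in items:
            for sim_j, label_j in items:
                if label_i > label_j:      # i is relevant, j is not
                    if sim_i > sim_j:      # ordered correctly
                        pos_pairs += 1
                    elif sim_i < sim_j:    # inverted
                        neg_pairs += 1
    return float(pos_pairs) / max(neg_pairs, 1)

rows = [("q1", 0.9, 1), ("q1", 0.4, 0), ("q1", 0.7, 0),
        ("q2", 0.2, 1), ("q2", 0.6, 0)]
print(pnr(rows))  # 2 correct vs 1 inverted pair -> 2.0
```

A pnr above 1.0 means correctly ordered pairs outnumber inverted ones, consistent with the pnr: 1.176... line in the log above.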
6. Note: because small training and test sets are used, the metrics fluctuate considerably from run to run. If the numbers you get are not as expected, simply rerun step 5 a few times to obtain reasonable values.

## Advanced usage
## FAQ
-#! /bin/bash
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,11 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set -e
-echo "begin to prepare data"
-mkdir -p data/train
-mkdir -p data/test
-python generate_synthetic_data.py
+#!/bin/bash
+echo "................run................."
+python -m paddlerec.run -m ./config.yaml >result1.txt
+grep -i "query_pt_sim" ./result1.txt >./result2.txt
+sed '$d' result2.txt >result.txt
+rm -f result1.txt
+rm -f result2.txt
+python transform.py
+sort -t $'\t' -k1,1 -k 2nr,2 pair.txt >result.txt
+rm -f pair.txt
+python ../../../tools/cal_pos_neg.py result.txt
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import numpy as np
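# Read the ground-truth labels written by preprocess.py (label.txt);
# note that the readline() call below skips the file's first line.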
label = []
filename = './data/label.txt'
f = open(filename, "r")
f.readline()
num = 0
for line in f.readlines():
num = num + 1
line = line.strip()
label.append(line)
f.close()
print(num)
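# Parse the predicted similarity out of each kept log line; run.sh greps for
# "query_pt_sim", so each line presumably contains a field like
# "query_pt_sim: [0.83]" (an assumption inferred from the parsing below).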
filename = './result.txt'
sim = []
for line in open(filename):
line = line.strip().split(",")
line[1] = line[1].split(":")
line = line[1][1].strip(" ")
line = line.strip("[")
line = line.strip("]")
sim.append(float(line))
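# Read back the comma-separated query ids saved by preprocess.py
# (testquery.txt), again skipping the first line.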
filename = './data/testquery.txt'
f = open(filename, "r")
f.readline()
query = []
for line in f.readlines():
line = line.strip()
query.append(line)
f.close()
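# Write (query, similarity, label) triples; run.sh sorts this file
# and feeds it to cal_pos_neg.py.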
filename = 'pair.txt'
f = open(filename, "w")
for i in range(len(sim)):
f.write(str(query[i]) + "\t" + str(sim[i]) + "\t" + str(label[i]) + "\n")
f.close()
@@ -11,14 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #!/usr/bin/python
 #-*- coding:utf-8 -*-
-############################
-#File Name: cal_pos_neg.py
-#Author: youqiheng
-#Mail: youqiheng@baidu.com
-#Created Time: 2018-04-15 21:59:45
-############################
 """
 docstring
 """
...