Merge pull request #184 from yinhaofeng/multiview-simnet

add model multiview-simnet

Merge pull request #184 from yinhaofeng/multiview-simnet
add model multiview-simnet
2caf374d · wuzhihua · GitHub · 03cec6d1 · 1cded004 · 2caf374d
11 changed file
--- a/models/match/multiview-simnet/config.yaml
+++ b/models/match/multiview-simnet/config.yaml
@@ -18,12 +18,12 @@ workspace: "models/match/multiview-simnet"
 # list of dataset
 dataset:
 - name: dataset_train # name of dataset to distinguish different datasets
-  batch_size: 2
+  batch_size: 128
  type: DataLoader # or QueueDataset
  data_path: "{workspace}/data/train"
-  sparse_slots: "1 2 3"
+  sparse_slots: "0 1 2"
 - name: dataset_infer # name
-  batch_size: 2
+  batch_size: 1
  type: DataLoader # or QueueDataset
  data_path: "{workspace}/data/test"
  sparse_slots: "1 2"
@@ -34,17 +34,17 @@ hyper_parameters:
    class: Adam
    learning_rate: 0.0001
    strategy: async
-  query_encoder: "bow"
-  title_encoder: "bow"
+  query_encoder: "gru"
+  title_encoder: "gru"
  query_encode_dim: 128
  title_encode_dim: 128
-  sparse_feature_dim: 1000001
+  sparse_feature_dim: 1439
  embedding_dim: 128
  hidden_size: 128
  margin: 0.1

 # select runner by name
-mode: train_runner
+mode: [train_runner,infer_runner]
 # config of each runner.
 # runner is a kind of paddle training class, which wraps the train/infer process.
 runner:
@@ -62,12 +62,14 @@ runner:
  save_inference_fetch_varnames: [] # fetch vars of save inference
  init_model_path: "" # load model path
  print_interval: 1
+  phases: phase1
 - name: infer_runner
  class: infer
  # device to run training or infer
  device: cpu
  print_interval: 1
-  init_model_path: "increment/0" # load model path
+  init_model_path: "increment/1" # load model path
+  phases: phase2

 # runner will run all the phase in each epoch
 phase:
@@ -75,7 +77,7 @@ phase:
  model: "{workspace}/model.py" # user-defined model
  dataset_name: dataset_train # select dataset by name
  thread_num: 1
-#- name: phase2
-#  model: "{workspace}/model.py" # user-defined model
-#  dataset_name: dataset_infer # select dataset by name
-#  thread_num: 1
+- name: phase2
+  model: "{workspace}/model.py" # user-defined model
+  dataset_name: dataset_infer # select dataset by name
+  thread_num: 1
--- a/models/match/multiview-simnet/data/preprocess.py
+++ b/models/match/multiview-simnet/data/preprocess.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#encoding=utf-8
+
+import os
+import sys
+import numpy as np
+import random
+
+f = open("./zhidao", "r")
+lines = f.readlines()
+f.close()
+
+#建立字典
+word_dict = {}
+for line in lines:
+    line = line.strip().split("\t")
+    text = line[0].split(" ") + line[1].split(" ")
+    for word in text:
+        if word in word_dict:
+            continue
+        else:
+            word_dict[word] = len(word_dict) + 1
+
+f = open("./zhidao", "r")
+lines = f.readlines()
+f.close()
+
+lines = [line.strip().split("\t") for line in lines]
+
+#建立以query为key，以负例为value的字典
+neg_dict = {}
+for line in lines:
+    if line[2] == "0":
+        if line[0] in neg_dict:
+            neg_dict[line[0]].append(line[1])
+        else:
+            neg_dict[line[0]] = [line[1]]
+
+#建立以query为key，以正例为value的字典
+pos_dict = {}
+for line in lines:
+    if line[2] == "1":
+        if line[0] in pos_dict:
+            pos_dict[line[0]].append(line[1])
+        else:
+            pos_dict[line[0]] = [line[1]]
+
+#划分训练集和测试集
+query_list = list(pos_dict.keys())
+#print(len(query_list))
+random.shuffle(query_list)
+train_query = query_list[:90]
+test_query = query_list[90:]
+
+#获得训练集
+train_set = []
+for query in train_query:
+    for pos in pos_dict[query]:
+        if query not in neg_dict:
+            continue
+        for neg in neg_dict[query]:
+            train_set.append([query, pos, neg])
+random.shuffle(train_set)
+
+#获得测试集
+test_set = []
+for query in test_query:
+    for pos in pos_dict[query]:
+        test_set.append([query, pos, 1])
+    if query not in neg_dict:
+        continue
+    for neg in neg_dict[query]:
+        test_set.append([query, neg, 0])
+random.shuffle(test_set)
+
+#训练集中的query,pos,neg转化格式
+f = open("train.txt", "w")
+for line in train_set:
+    query = line[0].strip().split(" ")
+    pos = line[1].strip().split(" ")
+    neg = line[2].strip().split(" ")
+    query_list = []
+    for word in query:
+        query_list.append(word_dict[word])
+    pos_list = []
+    for word in pos:
+        pos_list.append(word_dict[word])
+    neg_list = []
+    for word in neg:
+        neg_list.append(word_dict[word])
+    f.write(' '.join(["0:" + str(x) for x in query_list]) + " " + ' '.join([
+        "1:" + str(x) for x in pos_list
+    ]) + " " + ' '.join(["2:" + str(x) for x in neg_list]) + "\n")
+f.close()
+
+#测试集中的query和pos转化格式
+f = open("test.txt", "w")
+fa = open("label.txt", "w")
+fb = open("testquery.txt", "w")
+for line in test_set:
+    query = line[0].strip().split(" ")
+    pos = line[1].strip().split(" ")
+    label = line[2]
+    query_list = []
+    for word in query:
+        query_list.append(word_dict[word])
+    pos_list = []
+    for word in pos:
+        pos_list.append(word_dict[word])
+    f.write(' '.join(["0:" + str(x) for x in query_list]) + " " + ' '.join(
+        ["1:" + str(x) for x in pos_list]) + "\n")
+    fa.write(str(label) + "\n")
+    fb.write(','.join([str(x) for x in query_list]) + "\n")
+f.close()
+fa.close()
+fb.close()
--- a/models/match/multiview-simnet/data/test/test.txt
+++ b/models/match/multiview-simnet/data/test/test.txt
-224289:0 126379:0 284519:0 549329:0 750666:0 393772:0 586898:0 736887:0 48785:0 906517:0 229162:1 483485:1 739835:1 29957:1 694497:1 997508:1 556876:1 717791:1 232176:1 430356:1
-366182:0 82062:0 708883:0 949128:0 798964:0 639103:0 409033:0 79301:0 405607:0 342616:0 61552:1 560547:1 3760:1 754734:1 98496:1 472427:1 979596:1 750283:1 492028:1 801383:1
-969571:0 405187:0 756217:0 563640:0 572168:0 881952:0 446260:0 692177:0 994140:0 485393:0 509081:1 297377:1 465399:1 934708:1 430949:1 135651:1 484531:1 385306:1 463957:1 996004:1
-436320:0 423131:0 963969:0 78345:0 879550:0 458203:0 684397:0 956202:0 989802:0 526101:0 852446:1 182545:1 625656:1 674856:1 422648:1 74100:1 48372:1 850830:1 336087:1 178251:1
-242683:0 118677:0 20731:0 970617:0 355890:0 739613:0 926695:0 963639:0 201043:0 611907:0 115309:1 310984:1 615584:1 638886:1 575934:1 889389:1 974807:1 570987:1 532482:1 911925:1
-954007:0 122623:0 168195:0 348901:0 217880:0 84759:0 925763:0 436382:0 573742:0 942921:0 553377:1 835046:1 137907:1 933870:1 766585:1 48483:1 543079:1 889467:1 521705:1 906676:1
-798690:0 617323:0 553266:0 232924:0 159461:0 404822:0 52992:0 364854:0 913876:0 547974:0 559472:1 748595:1 71793:1 357331:1 606888:1 477051:1 291481:1 89363:1 503881:1 423029:1
-228207:0 785250:0 661149:0 803304:0 478781:0 495202:0 804509:0 273065:0 26123:0 810840:0 801871:1 146772:1 421009:1 752344:1 946358:1 531668:1 5771:1 191294:1 627329:1 434664:1
-984628:0 762075:0 505288:0 48519:0 72492:0 26568:0 684085:0 613095:0 781547:0 895829:0 280541:1 903234:1 708065:1 386658:1 331060:1 3693:1 279760:1 459579:1 423552:1 962594:1
-674172:0 39271:0 646093:0 757969:0 553251:0 734960:0 967186:0 856940:0 617246:0 376452:0 113050:1 472707:1 975057:1 865095:1 155824:1 389921:1 205520:1 513667:1 163588:1 953463:1
+0:908 0:159 0:909 0:910 0:109 1:911 1:159 1:909 1:910 1:109
+0:210 0:10 0:211 0:14 0:212 1:211 1:210 1:32 1:148 1:212 1:48 1:65 1:65 1:211 1:210 1:33 1:213 1:214 1:48
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:79 1:80 1:81 1:13 1:78 1:1 1:692 1:144 1:85 1:48
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:75 1:83 1:78 1:86 1:270 1:85 1:48
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:78 1:79 1:80 1:235 1:144 1:236 1:169 1:237 1:138 1:48 1:22
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:227 1:228 1:13 1:75 1:229 1:80 1:81 1:4 1:78 1:14 1:39
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:113 1:68 1:21 1:22
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:176 1:113 1:68 1:1357
+0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:1105 1:732 1:729 1:509 1:510 1:75 1:68 1:730 1:16 1:731 1:22
+0:155 0:837 0:838 0:839 1:155 1:838 1:1296
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:113 1:68 1:21
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:83 1:100 1:79 1:81 1:4 1:86 1:82 1:94 1:84 1:85 1:48 1:22
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:113 1:68 1:114 1:21 1:22
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:167 1:168 1:13 1:80 1:81 1:144 1:82 1:169 1:170 1:171 1:172 1:148 1:173 1:174
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:449 1:450 1:80 1:10 1:451 1:13 1:452 1:453 1:6 1:85 1:168 1:81 1:4 1:78 1:22 1:22
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:230 1:113 1:68 1:114 1:13 1:144 1:113 1:68 1:114
+0:155 0:837 0:838 0:839 1:1371 1:155 1:578 1:838 1:21 1:839 1:22
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:80 1:81 1:86 1:172 1:83 1:170 1:138 1:48
+0:91 0:421 0:104 0:695 0:96 0:696 0:697 1:421 1:104 1:698 1:67 1:96 1:696
+0:222 0:223 0:224 0:225 0:67 0:96 1:624 1:1238 1:222 1:223 1:224 1:67 1:96
+0:210 0:10 0:211 0:14 0:212 1:211 1:614 1:214 1:86 1:82 1:48 1:65 1:65 1:155 1:212
+0:91 0:421 0:104 0:695 0:96 0:696 0:697 1:421 1:104 1:1406 1:1407
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:1222 1:116 1:113 1:68 1:22
+0:91 0:421 0:104 0:695 0:96 0:696 0:697 1:421 1:104 1:695 1:96 1:696 1:1128
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:233 1:1350 1:4 1:1074 1:113 1:68 1:21 1:70 1:22
+0:222 0:223 0:224 0:225 0:67 0:96 1:222 1:223 1:224 1:419 1:96 1:1054 1:1055
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:167 1:79 1:80 1:81 1:86 1:82 1:83 1:84 1:138 1:48 1:22
+0:222 0:223 0:224 0:225 0:67 0:96 1:222 1:223 1:224 1:67 1:96
+0:222 0:223 0:224 0:225 0:67 0:96 1:222 1:226 1:223 1:224 1:67 1:96
+0:210 0:10 0:211 0:14 0:212 1:210 1:211 1:32 1:4 1:474 1:637
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:1 1:1211 1:178 1:78 1:13 1:79 1:80 1:81 1:14 1:85 1:22
+0:421 0:456 0:153 0:152 0:159 0:457 1:421 1:920 1:456 1:153 1:152 1:14 1:921
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:144 1:113 1:68 1:115
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:1106 1:78 1:1107 1:13 1:170 1:1108 1:13 1:1109 1:75 1:79 1:80 1:81 1:13 1:177 1:85 1:577 1:78 1:32 1:170 1:86 1:82 1:48 1:22
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:135 1:78 1:91 1:79 1:78 1:136 1:81 1:4 1:137 1:86 1:82 1:83 1:84 1:138 1:48
+0:421 0:456 0:153 0:152 0:159 0:457 1:153 1:421 1:456 1:152 1:475 1:68 1:476
+0:155 0:837 0:838 0:839 1:155 1:838 1:839
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:576 1:168 1:80 1:81 1:13 1:86 1:80 1:83 1:170 1:48 1:22
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:1070 1:78 1:33 1:67 1:79 1:121 1:80 1:81 1:276 1:162 1:1071 1:1072 1:103 1:13 1:167 1:1073 1:164 1:86 1:8 1:83 1:170 1:6 1:138 1:48 1:22
+0:222 0:223 0:224 0:225 0:67 0:96 1:421 1:936 1:223 1:4 1:937 1:224 1:67 1:96 1:22
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:110 1:144 1:113 1:68 1:1155 1:22
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:78 1:80 1:81 1:13 1:86 1:82 1:1280 1:4 1:170 1:138 1:48
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:80 1:81 1:4 1:144 1:8 1:169 1:84 1:171 1:172 1:48
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:170 1:65 1:65 1:168 1:138 1:80 1:1212 1:81 1:65 1:65 1:13 1:65 1:65 1:452 1:172 1:538 1:6 1:80 1:173
+0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:1105 1:732 1:729 1:509 1:510 1:75 1:68 1:730 1:13 1:75 1:68 1:734 1:48 1:22 1:22
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:1070 1:78 1:618 1:81 1:14 1:39 1:86 1:82 1:83 1:170 1:138 1:48
+0:1026 0:1027 0:1028 0:1029 0:1030 0:1031 0:75 0:480 1:1027 1:75 1:480 1:1029 1:4 1:1031 1:65 1:65 1:1032 1:1033 1:1034 1:1029 1:1031 1:1250
+0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:747 1:748 1:729 1:75 1:68 1:730 1:16 1:734
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:80 1:81 1:65 1:65 1:87 1:82 1:83 1:84 1:80
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:84 1:86 1:1039 1:85 1:168 1:81 1:4 1:78 1:48 1:22
+0:1026 0:1027 0:1028 0:1029 0:1030 0:1031 0:75 0:480 1:1032 1:1033 1:4 1:1034 1:1031 1:22
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:94 1:84 1:79 1:85 1:617 1:4 1:78 1:13 1:87 1:618 1:81
+0:908 0:159 0:909 0:910 0:109 1:911 1:14 1:922 1:910 1:109 1:877
+0:1335 0:409 0:1336 0:10 1:1335 1:409 1:1336 1:10
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:110 1:4 1:111 1:112 1:113 1:68 1:1074
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:1137 1:100 1:895 1:80 1:81 1:13 1:86 1:82 1:83 1:84 1:6 1:138 1:48 1:22
+0:908 0:159 0:909 0:910 0:109 1:908 1:14 1:1311 1:910 1:109 1:877
+0:421 0:456 0:153 0:152 0:159 0:457 1:421 1:153 1:456 1:152 1:14 1:457
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:233 1:234 1:4 1:111 1:112 1:113 1:68 1:114 1:22
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:1186 1:78 1:13 1:79 1:81 1:79 1:1187 1:86 1:82 1:83 1:84 1:6 1:80 1:48 1:22
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:13 1:113 1:68 1:115 1:769 1:548 1:22
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:168 1:80 1:81 1:65 1:65 1:86 1:8 1:83 1:84 1:80 1:48
+0:210 0:10 0:211 0:14 0:212 1:211 1:427 1:32 1:614 1:212 1:14 1:39
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:110 1:4 1:111 1:112 1:113 1:68 1:114
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:86 1:113 1:480 1:1283 1:22
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:958 1:170 1:450 1:121 1:80 1:10 1:1428 1:13 1:1429 1:85 1:79 1:81 1:4 1:78 1:13 1:33 1:1251 1:4 1:160 1:137
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:28 1:170 1:439 1:1165 1:1166 1:13 1:133 1:85 1:94 1:168 1:80 1:81 1:4 1:78 1:48 1:22 1:22
+0:222 0:223 0:224 0:225 0:67 0:96 1:421 1:422 1:223 1:224 1:67 1:96
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:112 1:113 1:68 1:22 1:148 1:112 1:113 1:68 1:22
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:587 1:170 1:80 1:10 1:774 1:10 1:13 1:57 1:51 1:86 1:85 1:94 1:168 1:81 1:4 1:78 1:22
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:951 1:442 1:4 1:111 1:13 1:112 1:113 1:480 1:114 1:22
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:772 1:84 1:144 1:85 1:168 1:80 1:81 1:4 1:78 1:48 1:22
+0:210 0:10 0:211 0:14 0:212 1:210 1:10 1:211 1:14 1:212
+0:222 0:223 0:224 0:225 0:67 0:96 1:222 1:1378 1:223 1:224 1:67 1:96
+0:155 0:837 0:838 0:839 1:49 1:14 1:838 1:839
+0:210 0:10 0:211 0:14 0:212 1:148 1:472 1:473 1:211 1:13 1:210 1:32 1:155 1:474
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:28 1:78 1:80 1:81 1:137 1:1112 1:84 1:450 1:1113 1:81 1:137 1:86 1:85 1:81 1:4 1:78 1:48 1:22
+0:908 0:159 0:909 0:910 0:109 1:911 1:912 1:909 1:910 1:109
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:807 1:78 1:169 1:81 1:94 1:170 1:144 1:80 1:48 1:22
+0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:732 1:729 1:75 1:68 1:730 1:16 1:734 1:22
+0:91 0:421 0:104 0:695 0:96 0:696 0:697 1:104 1:421 1:86 1:695 1:96 1:696 1:9
+0:155 0:837 0:838 0:839 1:1052 1:205 1:155 1:838 1:839 1:70
+0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:732 1:729 1:509 1:510 1:75 1:68 1:730 1:16 1:734 1:22
+0:210 0:10 0:211 0:14 0:212 1:211 1:65 1:65 1:14 1:212 1:65 1:65 1:14 1:1349
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:882 1:113 1:68 1:21
+0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:1079 1:732 1:729 1:75 1:68 1:730 1:16 1:734 1:22
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:958 1:84 1:959 1:80 1:577 1:14 1:39 1:13 1:79 1:78 1:80 1:81 1:86 1:82 1:169 1:84 1:960 1:48
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:1074 1:113 1:68
+0:210 0:10 0:211 0:14 0:212 1:211 1:210 1:10 1:14 1:212 1:211
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:116 1:113 1:68 1:800 1:173
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:168 1:80 1:81 1:4 1:78 1:13 1:423 1:424 1:235 1:4 1:84 1:138 1:48
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:78 1:121 1:80 1:81 1:86 1:82 1:83 1:170 1:138 1:48 1:22
+0:421 0:456 0:153 0:152 0:159 0:457 1:222 1:39 1:456 1:153 1:152 1:475 1:495 1:737 1:1076 1:102 1:1077 1:1078
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:112 1:113 1:68 1:114
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:75 1:110 1:4 1:111 1:113 1:68 1:115 1:116 1:22
+0:91 0:421 0:104 0:695 0:96 0:696 0:697 1:421 1:104 1:86 1:695 1:96 1:696 1:9 1:65 1:65 1:104 1:86 1:695 1:96 1:696 1:1128
+0:75 0:110 0:4 0:111 0:112 0:113 0:68 0:114 1:113 1:68 1:114 1:86 1:75 1:110
+0:421 0:456 0:153 0:152 0:159 0:457 1:421 1:1227 1:456 1:152 1:14 1:457
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:80 1:81 1:86 1:82 1:118 1:170 1:138 1:48
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:79 1:78 1:80 1:81 1:86 1:8 1:1087 1:84 1:80 1:48
+0:1026 0:1027 0:1028 0:1029 0:1030 0:1031 0:75 0:480 1:1391 1:1392 1:13 1:1393 1:1032 1:1033 1:189 1:4 1:629 1:1034 1:1031 1:48
+0:908 0:159 0:909 0:910 0:109 1:908 1:30 1:155 1:922 1:910 1:109 1:877 1:22
+0:728 0:729 0:509 0:510 0:75 0:68 0:730 0:16 0:731 0:245 1:729 1:732 1:733 1:10 1:120 1:75 1:68 1:730 1:16 1:734
+0:77 0:78 0:79 0:80 0:81 0:82 0:83 0:84 0:85 0:86 0:48 1:94 1:78 1:80 1:81 1:65 1:65 1:58 1:94 1:84 1:85 1:206 1:14 1:85 1:22
--- a/models/match/multiview-simnet/data/train/train.txt
+++ b/models/match/multiview-simnet/data/train/train.txt
--- a/models/match/multiview-simnet/evaluate_reader.py
+++ b/models/match/multiview-simnet/evaluate_reader.py
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -40,8 +39,8 @@ class Reader(ReaderBase):
            padding = 0
            output = [(slot, []) for slot in self.all_slots]
            for elem in elements:
-                feasign, slot = elem.split(':')
-                if not self._all_slots_dict.has_key(slot):
+                slot, feasign = elem.split(':')
+                if slot not in self._all_slots_dict:
                    continue
                self._all_slots_dict[slot][0] = True
                index = self._all_slots_dict[slot][1]

--- a/models/match/multiview-simnet/generate_synthetic_data.py
+++ b/models/match/multiview-simnet/generate_synthetic_data.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import random
-
-
-class Dataset:
-    def __init__(self):
-        pass
-
-
-class SyntheticDataset(Dataset):
-    def __init__(self,
-                 sparse_feature_dim,
-                 query_slot_num,
-                 title_slot_num,
-                 dataset_size=10000):
-        # ids are randomly generated
-        self.ids_per_slot = 10
-        self.sparse_feature_dim = sparse_feature_dim
-        self.query_slot_num = query_slot_num
-        self.title_slot_num = title_slot_num
-        self.dataset_size = dataset_size
-
-    def _reader_creator(self, is_train):
-        def generate_ids(num, space):
-            return [random.randint(0, space - 1) for i in range(num)]
-
-        def reader():
-            for i in range(self.dataset_size):
-                query_slots = []
-                pos_title_slots = []
-                neg_title_slots = []
-                for i in range(self.query_slot_num):
-                    qslot = generate_ids(self.ids_per_slot,
-                                         self.sparse_feature_dim)
-                    qslot = [str(fea) + ':' + str(i) for fea in qslot]
-                    query_slots += qslot
-                for i in range(self.title_slot_num):
-                    pt_slot = generate_ids(self.ids_per_slot,
-                                           self.sparse_feature_dim)
-                    pt_slot = [
-                        str(fea) + ':' + str(i + self.query_slot_num)
-                        for fea in pt_slot
-                    ]
-                    pos_title_slots += pt_slot
-                if is_train:
-                    for i in range(self.title_slot_num):
-                        nt_slot = generate_ids(self.ids_per_slot,
-                                               self.sparse_feature_dim)
-                        nt_slot = [
-                            str(fea) + ':' +
-                            str(i + self.query_slot_num + self.title_slot_num)
-                            for fea in nt_slot
-                        ]
-                        neg_title_slots += nt_slot
-                    yield query_slots + pos_title_slots + neg_title_slots
-                else:
-                    yield query_slots + pos_title_slots
-
-        return reader
-
-    def train(self):
-        return self._reader_creator(True)
-
-    def valid(self):
-        return self._reader_creator(True)
-
-    def test(self):
-        return self._reader_creator(False)
-
-
-if __name__ == '__main__':
-    sparse_feature_dim = 1000001
-    query_slots = 1
-    title_slots = 1
-    dataset_size = 10
-    dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots,
-                               dataset_size)
-    train_reader = dataset.train()
-    test_reader = dataset.test()
-
-    with open("data/train/train.txt", 'w') as fout:
-        for data in train_reader():
-            fout.write(' '.join(data))
-            fout.write("\n")
-
-    with open("data/test/test.txt", 'w') as fout:
-        for data in test_reader():
-            fout.write(' '.join(data))
-            fout.write("\n")
--- a/models/match/multiview-simnet/reader.py
+++ b/models/match/multiview-simnet/reader.py
@@ -43,8 +43,8 @@ class Reader(ReaderBase):
            padding = 0
            output = [(slot, []) for slot in self.all_slots]
            for elem in elements:
-                feasign, slot = elem.split(':')
-                if not self._all_slots_dict.has_key(slot):
+                slot, feasign = elem.split(':')
+                if slot not in self._all_slots_dict:
                    continue
                self._all_slots_dict[slot][0] = True
                index = self._all_slots_dict[slot][1]

--- a/models/match/multiview-simnet/readme.md
+++ b/models/match/multiview-simnet/readme.md
+# multiview-simnet文本匹配模型
+
+以下是本例的简要目录结构及说明： 
+
+```
+├── data #样例数据
+	├── train
+		├── train.txt #训练数据样例
+	├── test
+    	├── test.txt #测试数据样例
+	├── preprocess.py #数据处理程序
+├── __init__.py
+├── README.md #文档
+├── model.py #模型文件
+├── config.yaml #配置文件
+├── run.sh #运行脚本,在效果复现时使用
+├── transform.py #整理格式准备计算指标的程序
+├── reader.py #读者需要自定义数据集时供读者参考
+├── evaluate_reader.py #读者需要自定义数据集时供读者参考
+```
+注：在阅读该示例前，建议您先了解以下内容：
+
+[paddlerec入门教程](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
+
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [运行环境](#运行环境)
+- [快速开始](#快速开始)
+- [效果复现](#效果复现)
+- [进阶使用](#进阶使用)
+- [FAQ](#FAQ)
+
+
+## 模型简介
+在个性化推荐场景中，推荐系统给用户提供的项目（Item）列表通常是通过个性化的匹配模型计算出来的。在现实世界中，一个用户可能有很多个视角的特征，比如用户Id，年龄，项目的点击历史等。一个项目，举例来说，新闻资讯，也会有多种视角的特征比如新闻标题，新闻类别等。多视角Simnet模型是可以融合用户以及推荐项目的多个视角的特征并进行个性化匹配学习的一体化模型。 多视角Simnet模型包括多个编码器模块，每个编码器被用在不同的特征视角上。当前，项目中提供Bag-of-Embedding编码器，Temporal-Convolutional编码器，和Gated-Recurrent-Unit编码器。我们会逐渐加入稀疏特征场景下比较实用的编码器到这个项目中。模型的训练方法，当前采用的是Pairwise ranking模式进行训练，即针对一对具有关联的User-Item组合，随机使用一个Item作为负例进行排序学习。 
+
+模型的具体细节可以阅读论文[MultiView-Simnet](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/frp1159-songA.pdf):
+<p align="center">
+<img align="center" src="../../../doc/imgs/multiview-simnet.png">
+</p>
+
+## 数据准备
+我们公开了自建的测试集，包括百度知道、ECOM、QQSIM、UNICOM 四个数据集。这里我们选取百度知道数据集来进行训练。执行以下命令可以获取上述数据集。
+```
+wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
+tar xzf simnet_dataset-1.0.0.tar.gz
+rm simnet_dataset-1.0.0.tar.gz
+```
+
+数据格式为一个标识句子的slot，后跟一个句子中词的token。两者形成{slot:token}的形式标识一个词：  
+```
+0:358 0:206 0:205 0:250 0:9 0:3 0:207 0:10 0:330 0:164 1:1144 1:217 1:206 1:9 1:3 1:207 1:10 1:398 1:2 2:217 2:206 2:9 2:3 2:207 2:10 2:398 2:2
+0:358 0:206 0:205 0:250 0:9 0:3 0:207 0:10 0:330 0:164 1:951 1:952 1:206 1:9 1:3 1:207 1:10 1:398 2:217 2:206 2:9 2:3 2:207 2:10 2:398 2:2
+```
+
+## 运行环境
+PaddlePaddle>=1.7.2  
+python 2.7  
+PaddleRec >=0.1  
+os : linux  
+
+## 快速开始
+本文提供了样例数据可以供您快速体验，在paddlerec目录下直接执行下面的命令即可启动训练： 
+
+```
+python -m paddlerec.run -m models/match/multiview-simnet/config.yaml
+```   
+
+
+## 效果复现
+为了方便使用者能够快速的跑通每一个模型，我们在每个模型下都提供了样例数据。如果需要复现readme中的效果,请按如下步骤依次操作即可。  
+1. 确认您当前所在目录为PaddleRec/models/match/multiview-simnet
+2. 在data目录下载并解压数据集，命令如下：  
+``` 
+cd data
+wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
+tar xzf simnet_dataset-1.0.0.tar.gz
+rm -f simnet_dataset-1.0.0.tar.gz
+mv data/zhidao ./
+rm -rf data
+```
+3. 本文提供了快速将数据集中的汉字数据处理为可训练格式数据的脚本。您在解压数据集后，可以看见目录中存在一个名为zhidao的文件，然后可以在python3环境下运行我们提供的preprocess.py文件，即可生成可以直接用于训练的数据文件test.txt、train.txt、label.txt和testquery.txt。将train.txt和test.txt分别放入train和test目录下以备训练时调用。命令如下：
+```
+python3 preprocess.py
+rm -f ./train/train.txt
+mv train.txt ./train
+rm -f ./test/test.txt
+mv test.txt ./test
+cd ..
+```
+4. 退回multiview-simnet目录中，打开文件config.yaml，更改其中的参数  
+
+    将workspace改为您当前的绝对路径。（可用pwd命令获取绝对路径）  
+
+5.  执行脚本，开始训练。脚本会运行python -m paddlerec.run -m ./config.yaml启动训练，并将结果输出到result文件中。然后启动格式整理程序transform，最后计算正逆序比：
+```
+sh run.sh
+```
+
+运行结果大致如下：
+```
+................run.................
+!!! The CPU_NUM is not specified, you should set CPU_NUM in the environment variable list.
+CPU_NUM indicates that how many CPUPlace are used in the current task.
+And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.
+
+export CPU_NUM=32 # for example, set CPU_NUM as number of physical CPU core which is 32.
+
+!!! The default number of CPU_NUM=1.
+I0821 14:24:57.255358  7888 parallel_executor.cc:440] The Program will be executed on CPU using ParallelExecutor, 1 cards are used, so 1 programs are executed in parallel.
+I0821 14:24:57.259166  7888 build_strategy.cc:365] SeqOnlyAllReduceOps:0, num_trainers:1
+I0821 14:24:57.262634  7888 parallel_executor.cc:307] Inplace strategy is enabled, when build_strategy.enable_inplace = True
+I0821 14:24:57.264791  7888 parallel_executor.cc:375] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
+103
+pnr: 1.17674418605
+query_num: 11
+pair_num: 468 468
+equal_num: 0
+正序率： 0.540598290598
+253 215
+```
+6. 提醒：因为采取较小的数据集进行训练和测试，得到指标的浮动程度会比较大。如果得到的指标不合预期，可以多次执行步骤5，即可获得合理的指标。
+## 进阶使用
+  
+## FAQ
--- a/models/match/multiview-simnet/data_process.sh
+++ b/models/match/multiview-simnet/data_process.sh
-#! /bin/bash
-
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,11 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
-set -e
-echo "begin to prepare data"
-
-mkdir -p data/train
-mkdir -p data/test
-
-python generate_synthetic_data.py 
+#!/bin/bash
+# Run PaddleRec with ./config.yaml, extract the predicted similarities from
+# its log and compute the positive/negative order statistics.
+echo "................run................."
+python -m paddlerec.run -m ./config.yaml >result1.txt
+# Keep only the log lines that mention "query_pt_sim".
+grep -i "query_pt_sim" ./result1.txt >./result2.txt
+# Drop the last matched line before handing the scores to transform.py.
+sed '$d' result2.txt >result.txt
+rm -f result1.txt
+rm -f result2.txt
+# transform.py reads result.txt and writes the tab-separated pair.txt.
+python transform.py
+# Sort by field 1, then by field 2 numerically in descending order. The tab
+# separator is produced with printf because $'\t' is a bash extension that a
+# POSIX sh does not expand when the script is started as "sh <script>".
+sort -t "$(printf '\t')" -k1,1 -k 2nr,2 pair.txt >result.txt
+rm -f pair.txt
+python ../../../tools/cal_pos_neg.py result.txt
--- a/models/match/multiview-simnet/transform.py
+++ b/models/match/multiview-simnet/transform.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import numpy as np
+
+label = []
+filename = './data/label.txt'
+f = open(filename, "r")
+f.readline()
+num = 0
+for line in f.readlines():
+    num = num + 1
+    line = line.strip()
+    label.append(line)
+f.close()
+print(num)
+
+filename = './result.txt'
+sim = []
+for line in open(filename):
+    line = line.strip().split(",")
+    line[1] = line[1].split(":")
+    line = line[1][1].strip(" ")
+    line = line.strip("[")
+    line = line.strip("]")
+    sim.append(float(line))
+
+filename = './data/testquery.txt'
+f = open(filename, "r")
+f.readline()
+query = []
+for line in f.readlines():
+    line = line.strip()
+    query.append(line)
+f.close()
+
+filename = 'pair.txt'
+f = open(filename, "w")
+for i in range(len(sim)):
+    f.write(str(query[i]) + "\t" + str(sim[i]) + "\t" + str(label[i]) + "\n")
+f.close()
--- a/tools/cal_pos_neg.py
+++ b/tools/cal_pos_neg.py
@@ -11,14 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#!/usr/bin/python  
-#-*- coding:utf-8 -*-  
-############################  
-#File Name: cal_pos_neg.py
-#Author: youqiheng 
-#Mail: youqiheng@baidu.com
-#Created Time: 2018-04-15 21:59:45
-############################ 
+#!/usr/bin/python   
+#-*- coding:utf-8 -*-   
 """
 docstring
 """