change

b57543eb · yinhaofeng · fb1a4c95 · b57543eb · b57543eb · b57543eb
8 changed files
--- a/models/match/dssm/config.yaml
+++ b/models/match/dssm/config.yaml
@@ -30,7 +30,7 @@ dataset:
 hyper_parameters:
  optimizer:
    class: sgd
-    learning_rate: 0.01
+    learning_rate: 0.001
    strategy: async
  trigram_d: 1439
  neg_num: 1
@@ -44,7 +44,7 @@ runner:
 - name: train_runner
  class: train
  # num of epochs
-  epochs: 4
+  epochs: 3
  # device to run training or infer
  device: cpu
  save_checkpoint_interval: 1 # save model interval of epochs
@@ -61,7 +61,7 @@ runner:
  # device to run training or infer
  device: cpu
  print_interval: 1
-  init_model_path: "increment/3" # load model path
+  init_model_path: "increment/2" # load model path
  phases: phase2
 # runner will run all the phase in each epoch

--- a/models/match/dssm/data/preprocess.py
+++ b/models/match/dssm/data/preprocess.py
@@ -29,15 +29,11 @@ f = open("./zhidao", "r")
 lines = f.readlines()
 f.close()
-#划分训练集和测试集
 lines = [line.strip().split("\t") for line in lines]
-random.shuffle(lines)
-train_set = lines[:900]
-test_set = lines[900:]
 #建立以query为key，以负例为value的字典
 neg_dict = {}
-for line in train_set:
+for line in lines:
    if line[2] == "0":
        if line[0] in neg_dict:
            neg_dict[line[0]].append(line[1])
@@ -46,31 +42,45 @@ for line in train_set:
 #建立以query为key，以正例为value的字典
 pos_dict = {}
-for line in train_set:
+for line in lines:
    if line[2] == "1":
        if line[0] in pos_dict:
            pos_dict[line[0]].append(line[1])
        else:
            pos_dict[line[0]] = [line[1]]
-#训练集整理为query，pos，neg的格式
+#划分训练集和测试集
-f = open("train.txt", "w")
+query_list = list(pos_dict.keys())
-for query in pos_dict.keys():
+#print(len(query))
+random.shuffle(query_list)
+train_query = query_list[:90]
+test_query = query_list[90:]
+#获得训练集
+train_set = []
+for query in train_query:
    for pos in pos_dict[query]:
        if query not in neg_dict:
            continue
        for neg in neg_dict[query]:
-            f.write(str(query) + "\t" + str(pos) + "\t" + str(neg) + "\n")
+            train_set.append([query, pos, neg])
-f.close()
+random.shuffle(train_set)
-f = open("train.txt", "r")
+#获得测试集
-lines = f.readlines()
+test_set = []
-f.close()
+for query in test_query:
+    for pos in pos_dict[query]:
+        test_set.append([query, pos, 1])  # positive document, label 1
+    if query not in neg_dict:
+        continue  # this query has no negative documents
+    for neg in neg_dict[query]:
+        test_set.append([query, neg, 0])  # negative document, label 0 (must be `neg`, not the stale `pos`)
+random.shuffle(test_set)
 #训练集中的query,pos,neg转化为词袋
 f = open("train.txt", "w")
-for line in lines:
+f = open("train.txt", "w")
-    line = line.strip().split("\t")
+for line in train_set:
    query = line[0].strip().split(" ")
    pos = line[1].strip().split(" ")
    neg = line[2].strip().split(" ")
@@ -103,6 +113,6 @@ for line in test_set:
        pos_token[word_dict[word]] = 1
    f.write(','.join([str(x) for x in query_token]) + "\t" + ','.join(
        [str(x) for x in pos_token]) + "\n")
-    fa.write(label + "\n")
+    fa.write(str(label) + "\n")
 f.close()
 fa.close()
--- a/models/match/dssm/data/test/test.txt
+++ b/models/match/dssm/data/test/test.txt
--- a/models/match/dssm/data/train/train.txt
+++ b/models/match/dssm/data/train/train.txt
--- a/models/match/dssm/model.py
+++ b/models/match/dssm/model.py
@@ -94,7 +94,7 @@ class Model(ModelBase):
        prob = fluid.layers.softmax(concat_Rs, axis=1)
        hit_prob = fluid.layers.slice(
-            prob, axes=[0, 1], starts=[0, 0], ends=[128, 1])
+            prob, axes=[0, 1], starts=[0, 0], ends=[8, 1])
        loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
        avg_cost = fluid.layers.mean(x=loss)
        self._cost = avg_cost

--- a/models/match/dssm/readme.md
+++ b/models/match/dssm/readme.md
@@ -15,7 +15,9 @@
 ├── config.yaml #配置文件
 ├── synthetic_reader.py #读取训练集的程序
 ├── synthetic_evaluate_reader.py #读取测试集的程序
-├── eval.py #评价脚本
+├── transform.py #将数据整理成合适的格式方便计算指标
+├── run.sh #全量数据集中的训练脚本，从训练到预测并计算指标
 ```
 注：在阅读该示例前，建议您先了解以下内容：
@@ -61,13 +63,57 @@ PaddleRec >=0.1
 os : windows/linux/macos
 ## 快速开始
-本文提供了样例数据可以供您快速体验，直接执行下面的命令即可启动训练： 
+本文提供了样例数据可以供您快速体验，在paddlerec目录下执行下面的命令即可快速启动训练： 
 ```
 python -m paddlerec.run -m models/match/dssm/config.yaml
 ```   
+输出结果示例：
+```
+PaddleRec: Runner train_runner Begin
+Executor Mode: train
+processor_register begin
+Running SingleInstance.
+Running SingleNetwork.
+file_list : ['models/match/dssm/data/train/train.txt']
+Running SingleStartup.
+Running SingleRunner.
+!!! The CPU_NUM is not specified, you should set CPU_NUM in the environment variable list.
+CPU_NUM indicates that how many CPUPlace are used in the current task.
+And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.
+export CPU_NUM=32 # for example, set CPU_NUM as number of physical CPU core which is 32.
+!!! The default number of CPU_NUM=1.
+I0821 06:56:26.224299 31061 parallel_executor.cc:440] The Program will be executed on CPU using ParallelExecutor, 1 cards are used, so 1 programs are executed in parallel.
+I0821 06:56:26.231163 31061 build_strategy.cc:365] SeqOnlyAllReduceOps:0, num_trainers:1
+I0821 06:56:26.237023 31061 parallel_executor.cc:307] Inplace strategy is enabled, when build_strategy.enable_inplace = True
+I0821 06:56:26.240788 31061 parallel_executor.cc:375] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
+batch: 2, LOSS: [4.538238]
+batch: 4, LOSS: [4.16424]
+batch: 6, LOSS: [3.8121371]
+batch: 8, LOSS: [3.4250507]
+batch: 10, LOSS: [3.2285979]
+batch: 12, LOSS: [3.2116117]
+batch: 14, LOSS: [3.1406002]
+epoch 0 done, use time: 0.357971906662, global metrics: LOSS=[3.0968776]
+batch: 2, LOSS: [2.6843479]
+batch: 4, LOSS: [2.546976]
+batch: 6, LOSS: [2.4103594]
+batch: 8, LOSS: [2.301374]
+batch: 10, LOSS: [2.264183]
+batch: 12, LOSS: [2.315862]
+batch: 14, LOSS: [2.3409634]
+epoch 1 done, use time: 0.22123003006, global metrics: LOSS=[2.344321]
+batch: 2, LOSS: [2.0882485]
+batch: 4, LOSS: [2.006743]
+batch: 6, LOSS: [1.9231766]
+batch: 8, LOSS: [1.8850241]
+batch: 10, LOSS: [1.8829436]
+batch: 12, LOSS: [1.9336565]
+batch: 14, LOSS: [1.9784685]
+epoch 2 done, use time: 0.212922096252, global metrics: LOSS=[1.9934461]
+PaddleRec Finish
+```
 ## 效果复现
 为了方便使用者能够快速的跑通每一个模型，我们在每个模型下都提供了样例数据。如果需要复现readme中的效果,请按如下步骤依次操作即可。  
 1. 确认您当前所在目录为PaddleRec/models/match/dssm
@@ -94,16 +140,42 @@ cd ..
 测试集为两个稀疏的BOW方式的向量：query,pos  
 label.txt中对应的测试集中的标签
-4. 退回tagspace目录中，打开文件config.yaml,更改其中的参数  
+4. 退回dssm目录中，打开文件config.yaml,更改其中的参数  
 将workspace改为您当前的绝对路径。（可用pwd命令获取绝对路径）  
 将dataset_train中的batch_size从8改为128
+将文件model.py中的 hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[8, 1])  
+    改为 hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[128, 1])。当您需要改变batch_size的时候，ends中的第一个参数也需要随之改为相同的值
-5.  执行脚本，开始训练.脚本会运行python -m paddlerec.run -m ./config.yaml启动训练，并将结果输出到result文件中。然后启动评价脚本eval.py计算auc：
+5.  执行脚本，开始训练。脚本会运行python -m paddlerec.run -m ./config.yaml启动训练，并将结果输出到result文件中；然后启动transform.py整合数据，最后调用tools/cal_pos_neg.py计算出正逆序指标：
 ```
 sh run.sh
 ```
+输出结果示例：
+```
+................run.................
+!!! The CPU_NUM is not specified, you should set CPU_NUM in the environment variable list.
+CPU_NUM indicates that how many CPUPlace are used in the current task.
+And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.
+export CPU_NUM=32 # for example, set CPU_NUM as number of physical CPU core which is 32.
+!!! The default number of CPU_NUM=1.
+I0821 07:16:04.512531 32200 parallel_executor.cc:440] The Program will be executed on CPU using ParallelExecutor, 1 cards are used, so 1 programs are executed in parallel.
+I0821 07:16:04.515708 32200 build_strategy.cc:365] SeqOnlyAllReduceOps:0, num_trainers:1
+I0821 07:16:04.518872 32200 parallel_executor.cc:307] Inplace strategy is enabled, when build_strategy.enable_inplace = True
+I0821 07:16:04.520995 32200 parallel_executor.cc:375] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
+75
+pnr: 2.25581395349
+query_num: 11
+pair_num: 184 184
+equal_num: 44
+正序率： 0.692857142857
+97 43
+```
+6. 提醒：因为采取较小的数据集进行训练和测试，得到指标的浮动程度会比较大。如果得到的指标不合预期，可以多次执行步骤5，即可获得合理的指标。
 ## 进阶使用
 ## FAQ
--- a/models/match/dssm/run.sh
+++ b/models/match/dssm/run.sh
@@ -5,4 +5,7 @@ grep -i "query_doc_sim" ./result1.txt >./result2.txt
 sed '$d' result2.txt >result.txt
 rm -f result1.txt
 rm -f result2.txt
-python eval.py
+python transform.py
+sort -t $'\t' -k1,1 -k 2nr,2 pair.txt >result.txt
+rm -f pair.txt
+python ../../../tools/cal_pos_neg.py result.txt
--- a/models/match/dssm/eval.py
+++ b/models/match/dssm/eval.py
@@ -24,7 +24,7 @@ num = 0
 for line in f.readlines():
    num = num + 1
    line = line.strip()
-    label.append(float(line))
+    label.append(line)
 f.close()
 print(num)
@@ -38,5 +38,17 @@ for line in open(filename):
    line = line.strip("]")
    sim.append(float(line))
-auc = sklearn.metrics.roc_auc_score(label, sim)
+filename = './data/test/test.txt'
-print("auc = ", auc)
+f = open(filename, "r")
+f.readline()
+query = []
+for line in f.readlines():
+    line = line.strip().split("\t")
+    query.append(line[0])
+f.close()
+filename = 'pair.txt'
+f = open(filename, "w")
+for i in range(len(sim)):
+    f.write(str(query[i]) + "\t" + str(sim[i]) + "\t" + str(label[i]) + "\n")
+f.close()