提交 b57543eb 编写于 作者: Y yinhaofeng

change

上级 fb1a4c95
...@@ -30,7 +30,7 @@ dataset: ...@@ -30,7 +30,7 @@ dataset:
hyper_parameters: hyper_parameters:
optimizer: optimizer:
class: sgd class: sgd
learning_rate: 0.01 learning_rate: 0.001
strategy: async strategy: async
trigram_d: 1439 trigram_d: 1439
neg_num: 1 neg_num: 1
...@@ -44,7 +44,7 @@ runner: ...@@ -44,7 +44,7 @@ runner:
- name: train_runner - name: train_runner
class: train class: train
# num of epochs # num of epochs
epochs: 4 epochs: 3
# device to run training or infer # device to run training or infer
device: cpu device: cpu
save_checkpoint_interval: 1 # save model interval of epochs save_checkpoint_interval: 1 # save model interval of epochs
...@@ -61,7 +61,7 @@ runner: ...@@ -61,7 +61,7 @@ runner:
# device to run training or infer # device to run training or infer
device: cpu device: cpu
print_interval: 1 print_interval: 1
init_model_path: "increment/3" # load model path init_model_path: "increment/2" # load model path
phases: phase2 phases: phase2
# runner will run all the phase in each epoch # runner will run all the phase in each epoch
......
...@@ -29,15 +29,11 @@ f = open("./zhidao", "r") ...@@ -29,15 +29,11 @@ f = open("./zhidao", "r")
lines = f.readlines() lines = f.readlines()
f.close() f.close()
#划分训练集和测试集
lines = [line.strip().split("\t") for line in lines] lines = [line.strip().split("\t") for line in lines]
random.shuffle(lines)
train_set = lines[:900]
test_set = lines[900:]
#建立以query为key,以负例为value的字典 #建立以query为key,以负例为value的字典
neg_dict = {} neg_dict = {}
for line in train_set: for line in lines:
if line[2] == "0": if line[2] == "0":
if line[0] in neg_dict: if line[0] in neg_dict:
neg_dict[line[0]].append(line[1]) neg_dict[line[0]].append(line[1])
...@@ -46,31 +42,45 @@ for line in train_set: ...@@ -46,31 +42,45 @@ for line in train_set:
#建立以query为key,以正例为value的字典 #建立以query为key,以正例为value的字典
pos_dict = {} pos_dict = {}
for line in train_set: for line in lines:
if line[2] == "1": if line[2] == "1":
if line[0] in pos_dict: if line[0] in pos_dict:
pos_dict[line[0]].append(line[1]) pos_dict[line[0]].append(line[1])
else: else:
pos_dict[line[0]] = [line[1]] pos_dict[line[0]] = [line[1]]
#训练集整理为query,pos,neg的格式 #划分训练集和测试集
f = open("train.txt", "w") query_list = list(pos_dict.keys())
for query in pos_dict.keys(): #print(len(query))
random.shuffle(query_list)
train_query = query_list[:90]
test_query = query_list[90:]
#获得训练集
train_set = []
for query in train_query:
for pos in pos_dict[query]: for pos in pos_dict[query]:
if query not in neg_dict: if query not in neg_dict:
continue continue
for neg in neg_dict[query]: for neg in neg_dict[query]:
f.write(str(query) + "\t" + str(pos) + "\t" + str(neg) + "\n") train_set.append([query, pos, neg])
f.close() random.shuffle(train_set)
f = open("train.txt", "r") #获得测试集
lines = f.readlines() test_set = []
f.close() for query in test_query:
for pos in pos_dict[query]:
test_set.append([query, pos, 1])
if query not in neg_dict:
continue
for neg in neg_dict[query]:
test_set.append([query, pos, 0])
random.shuffle(test_set)
#训练集中的query,pos,neg转化为词袋 #训练集中的query,pos,neg转化为词袋
f = open("train.txt", "w") f = open("train.txt", "w")
for line in lines: f = open("train.txt", "w")
line = line.strip().split("\t") for line in train_set:
query = line[0].strip().split(" ") query = line[0].strip().split(" ")
pos = line[1].strip().split(" ") pos = line[1].strip().split(" ")
neg = line[2].strip().split(" ") neg = line[2].strip().split(" ")
...@@ -103,6 +113,6 @@ for line in test_set: ...@@ -103,6 +113,6 @@ for line in test_set:
pos_token[word_dict[word]] = 1 pos_token[word_dict[word]] = 1
f.write(','.join([str(x) for x in query_token]) + "\t" + ','.join( f.write(','.join([str(x) for x in query_token]) + "\t" + ','.join(
[str(x) for x in pos_token]) + "\n") [str(x) for x in pos_token]) + "\n")
fa.write(label + "\n") fa.write(str(label) + "\n")
f.close() f.close()
fa.close() fa.close()
因为 它太大了无法显示 source diff 。你可以改为 查看blob
因为 它太大了无法显示 source diff 。你可以改为 查看blob
...@@ -94,7 +94,7 @@ class Model(ModelBase): ...@@ -94,7 +94,7 @@ class Model(ModelBase):
prob = fluid.layers.softmax(concat_Rs, axis=1) prob = fluid.layers.softmax(concat_Rs, axis=1)
hit_prob = fluid.layers.slice( hit_prob = fluid.layers.slice(
prob, axes=[0, 1], starts=[0, 0], ends=[128, 1]) prob, axes=[0, 1], starts=[0, 0], ends=[8, 1])
loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob)) loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
avg_cost = fluid.layers.mean(x=loss) avg_cost = fluid.layers.mean(x=loss)
self._cost = avg_cost self._cost = avg_cost
......
...@@ -15,7 +15,9 @@ ...@@ -15,7 +15,9 @@
├── config.yaml #配置文件 ├── config.yaml #配置文件
├── synthetic_reader.py #读取训练集的程序 ├── synthetic_reader.py #读取训练集的程序
├── synthetic_evaluate_reader.py #读取测试集的程序 ├── synthetic_evaluate_reader.py #读取测试集的程序
├── eval.py #评价脚本 ├── transform.py #将数据整理成合适的格式方便计算指标
├── run.sh #全量数据集中的训练脚本,从训练到预测并计算指标
``` ```
注:在阅读该示例前,建议您先了解以下内容: 注:在阅读该示例前,建议您先了解以下内容:
...@@ -61,13 +63,57 @@ PaddleRec >=0.1 ...@@ -61,13 +63,57 @@ PaddleRec >=0.1
os : windows/linux/macos os : windows/linux/macos
## 快速开始 ## 快速开始
本文提供了样例数据可以供您快速体验,直接执行下面的命令即可启动训练: 本文提供了样例数据可以供您快速体验,在paddlerec目录下执行下面的命令即可快速启动训练:
``` ```
python -m paddlerec.run -m models/match/dssm/config.yaml python -m paddlerec.run -m models/match/dssm/config.yaml
``` ```
输出结果示例:
```
PaddleRec: Runner train_runner Begin
Executor Mode: train
processor_register begin
Running SingleInstance.
Running SingleNetwork.
file_list : ['models/match/dssm/data/train/train.txt']
Running SingleStartup.
Running SingleRunner.
!!! The CPU_NUM is not specified, you should set CPU_NUM in the environment variable list.
CPU_NUM indicates that how many CPUPlace are used in the current task.
And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.
export CPU_NUM=32 # for example, set CPU_NUM as number of physical CPU core which is 32.
!!! The default number of CPU_NUM=1.
I0821 06:56:26.224299 31061 parallel_executor.cc:440] The Program will be executed on CPU using ParallelExecutor, 1 cards are used, so 1 programs are executed in parallel.
I0821 06:56:26.231163 31061 build_strategy.cc:365] SeqOnlyAllReduceOps:0, num_trainers:1
I0821 06:56:26.237023 31061 parallel_executor.cc:307] Inplace strategy is enabled, when build_strategy.enable_inplace = True
I0821 06:56:26.240788 31061 parallel_executor.cc:375] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
batch: 2, LOSS: [4.538238]
batch: 4, LOSS: [4.16424]
batch: 6, LOSS: [3.8121371]
batch: 8, LOSS: [3.4250507]
batch: 10, LOSS: [3.2285979]
batch: 12, LOSS: [3.2116117]
batch: 14, LOSS: [3.1406002]
epoch 0 done, use time: 0.357971906662, global metrics: LOSS=[3.0968776]
batch: 2, LOSS: [2.6843479]
batch: 4, LOSS: [2.546976]
batch: 6, LOSS: [2.4103594]
batch: 8, LOSS: [2.301374]
batch: 10, LOSS: [2.264183]
batch: 12, LOSS: [2.315862]
batch: 14, LOSS: [2.3409634]
epoch 1 done, use time: 0.22123003006, global metrics: LOSS=[2.344321]
batch: 2, LOSS: [2.0882485]
batch: 4, LOSS: [2.006743]
batch: 6, LOSS: [1.9231766]
batch: 8, LOSS: [1.8850241]
batch: 10, LOSS: [1.8829436]
batch: 12, LOSS: [1.9336565]
batch: 14, LOSS: [1.9784685]
epoch 2 done, use time: 0.212922096252, global metrics: LOSS=[1.9934461]
PaddleRec Finish
```
## 效果复现 ## 效果复现
为了方便使用者能够快速的跑通每一个模型,我们在每个模型下都提供了样例数据。如果需要复现readme中的效果,请按如下步骤依次操作即可。 为了方便使用者能够快速的跑通每一个模型,我们在每个模型下都提供了样例数据。如果需要复现readme中的效果,请按如下步骤依次操作即可。
1. 确认您当前所在目录为PaddleRec/models/match/dssm 1. 确认您当前所在目录为PaddleRec/models/match/dssm
...@@ -94,16 +140,42 @@ cd .. ...@@ -94,16 +140,42 @@ cd ..
测试集为两个稀疏的BOW方式的向量:query,pos 测试集为两个稀疏的BOW方式的向量:query,pos
label.txt中对应的测试集中的标签 label.txt中对应的测试集中的标签
4. 退回tagspace目录中,打开文件config.yaml,更改其中的参数 4. 退回dssm目录中,打开文件config.yaml,更改其中的参数
将workspace改为您当前的绝对路径。(可用pwd命令获取绝对路径) 将workspace改为您当前的绝对路径。(可用pwd命令获取绝对路径)
将dataset_train中的batch_size从8改为128 将dataset_train中的batch_size从8改为128
将文件model.py中的 hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[8, 1])
改为 hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[128, 1])。当您需要改变batch_size的时候,ends中的第一个参数也需要随之变化。
5. 执行脚本,开始训练.脚本会运行python -m paddlerec.run -m ./config.yaml启动训练,并将结果输出到result文件中。然后启动评价脚本eval.py计算auc 5. 执行脚本,开始训练.脚本会运行python -m paddlerec.run -m ./config.yaml启动训练,并将结果输出到result文件中。然后启动transform.py整合数据,最后计算出正逆序指标
``` ```
sh run.sh sh run.sh
``` ```
输出结果示例:
```
................run.................
!!! The CPU_NUM is not specified, you should set CPU_NUM in the environment variable list.
CPU_NUM indicates that how many CPUPlace are used in the current task.
And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.
export CPU_NUM=32 # for example, set CPU_NUM as number of physical CPU core which is 32.
!!! The default number of CPU_NUM=1.
I0821 07:16:04.512531 32200 parallel_executor.cc:440] The Program will be executed on CPU using ParallelExecutor, 1 cards are used, so 1 programs are executed in parallel.
I0821 07:16:04.515708 32200 build_strategy.cc:365] SeqOnlyAllReduceOps:0, num_trainers:1
I0821 07:16:04.518872 32200 parallel_executor.cc:307] Inplace strategy is enabled, when build_strategy.enable_inplace = True
I0821 07:16:04.520995 32200 parallel_executor.cc:375] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
75
pnr: 2.25581395349
query_num: 11
pair_num: 184 184
equal_num: 44
正序率: 0.692857142857
97 43
```
6. 提醒:因为采用较小的数据集进行训练和测试,得到的指标浮动会比较大。如果得到的指标不符合预期,可以多次执行步骤5,以获得合理的指标。
## 进阶使用 ## 进阶使用
## FAQ ## FAQ
...@@ -5,4 +5,7 @@ grep -i "query_doc_sim" ./result1.txt >./result2.txt ...@@ -5,4 +5,7 @@ grep -i "query_doc_sim" ./result1.txt >./result2.txt
sed '$d' result2.txt >result.txt sed '$d' result2.txt >result.txt
rm -f result1.txt rm -f result1.txt
rm -f result2.txt rm -f result2.txt
python eval.py python transform.py
sort -t $'\t' -k1,1 -k 2nr,2 pair.txt >result.txt
rm -f pair.txt
python ../../../tools/cal_pos_neg.py result.txt
...@@ -24,7 +24,7 @@ num = 0 ...@@ -24,7 +24,7 @@ num = 0
for line in f.readlines(): for line in f.readlines():
num = num + 1 num = num + 1
line = line.strip() line = line.strip()
label.append(float(line)) label.append(line)
f.close() f.close()
print(num) print(num)
...@@ -38,5 +38,17 @@ for line in open(filename): ...@@ -38,5 +38,17 @@ for line in open(filename):
line = line.strip("]") line = line.strip("]")
sim.append(float(line)) sim.append(float(line))
auc = sklearn.metrics.roc_auc_score(label, sim) filename = './data/test/test.txt'
print("auc = ", auc) f = open(filename, "r")
f.readline()
query = []
for line in f.readlines():
line = line.strip().split("\t")
query.append(line[0])
f.close()
filename = 'pair.txt'
f = open(filename, "w")
for i in range(len(sim)):
f.write(str(query[i]) + "\t" + str(sim[i]) + "\t" + str(label[i]) + "\n")
f.close()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册