From 838928ddd0db0f411ed98c6539a081c776e9ca85 Mon Sep 17 00:00:00 2001 From: yinhaofeng <1841837261@qq.com> Date: Wed, 16 Sep 2020 15:19:01 +0000 Subject: [PATCH] change pyramid --- models/match/dssm/data/preprocess.py | 2 +- models/match/dssm/readme.md | 2 +- models/match/match-pyramid/data/process.py | 4 ++-- models/match/match-pyramid/data_process.sh | 4 +++- models/match/match-pyramid/eval.py | 4 ++-- models/match/match-pyramid/readme.md | 14 ++++++++------ models/match/match-pyramid/run.sh | 4 +++- 7 files changed, 20 insertions(+), 14 deletions(-) diff --git a/models/match/dssm/data/preprocess.py b/models/match/dssm/data/preprocess.py index 4d6e669c..3423120e 100644 --- a/models/match/dssm/data/preprocess.py +++ b/models/match/dssm/data/preprocess.py @@ -63,7 +63,7 @@ print("build dict done") #划分训练集和测试集 query_list = list(pos_dict.keys()) #print(len(query_list)) -random.shuffle(query_list) +#random.shuffle(query_list) train_query = query_list[:11600] test_query = query_list[11600:] diff --git a/models/match/dssm/readme.md b/models/match/dssm/readme.md index 7cbb8569..5228e634 100644 --- a/models/match/dssm/readme.md +++ b/models/match/dssm/readme.md @@ -157,7 +157,7 @@ label.txt中对应的测试集中的标签 将hyper_parameters中的slice_end从8改为128.当您需要改变batchsize的时候,这个参数也需要随之变化 将dataset_train中的data_path改为{workspace}/data/big_train 将dataset_infer中的data_path改为{workspace}/data/big_test -将hyper_parameters中的trigram_d改为6327 +将hyper_parameters中的trigram_d改为5913 5. 执行脚本,开始训练.脚本会运行python -m paddlerec.run -m ./config.yaml启动训练,并将结果输出到result文件中。然后启动transform.py整合数据,最后计算出正逆序指标: ``` diff --git a/models/match/match-pyramid/data/process.py b/models/match/match-pyramid/data/process.py index 7be9d1fb..9ab8fc65 100644 --- a/models/match/match-pyramid/data/process.py +++ b/models/match/match-pyramid/data/process.py @@ -106,7 +106,7 @@ def make_train(): pair_list.append((d1, high_d2, low_d2)) print('Pair Instance Count:', len(pair_list)) - f = open("./data/train/train.txt", "w") + f = open("./data/big_train/train.txt", "w") for batch in range(800): X1 = np.zeros((batch_size * 2, data1_maxlen), dtype=np.int32) X2 = np.zeros((batch_size * 2, data2_maxlen), dtype=np.int32) @@ -131,7 +131,7 @@ def make_train(): def make_test(): rel = read_relation(filename=os.path.join(Letor07Path, 'relation.test.fold1.txt')) - f = open("./data/test/test.txt", "w") + f = open("./data/big_test/test.txt", "w") for label, d1, d2 in rel: X1 = np.zeros(data1_maxlen, dtype=np.int32) X2 = np.zeros(data2_maxlen, dtype=np.int32) diff --git a/models/match/match-pyramid/data_process.sh b/models/match/match-pyramid/data_process.sh index dfd3a874..24da8d0f 100644 --- a/models/match/match-pyramid/data_process.sh +++ b/models/match/match-pyramid/data_process.sh @@ -3,7 +3,9 @@ echo "...........load data................." wget --no-check-certificate 'https://paddlerec.bj.bcebos.com/match_pyramid/match_pyramid_data.tar.gz' mv ./match_pyramid_data.tar.gz ./data -rm -rf ./data/relation.test.fold1.txt ./data/realtion.train.fold1.txt +rm -rf ./data/relation.test.fold1.txt tar -xvf ./data/match_pyramid_data.tar.gz +mkdir ./data/big_train +mkdir ./data/big_test echo "...........data process..............." python ./data/process.py diff --git a/models/match/match-pyramid/eval.py b/models/match/match-pyramid/eval.py index dae40cef..c3eccdb6 100644 --- a/models/match/match-pyramid/eval.py +++ b/models/match/match-pyramid/eval.py @@ -49,8 +49,8 @@ filename = './result.txt' pred = [] for line in open(filename): line = line.strip().split(",") - line[1] = line[1].split(":") - line = line[1][1].strip(" ") + line[3] = line[3].split(":") + line = line[3][1].strip(" ") line = line.strip("[") line = line.strip("]") pred.append(float(line)) diff --git a/models/match/match-pyramid/readme.md b/models/match/match-pyramid/readme.md index 2960d58e..2ee200a6 100644 --- a/models/match/match-pyramid/readme.md +++ b/models/match/match-pyramid/readme.md @@ -56,10 +56,10 @@ 4.嵌入层文件:我们将预训练的词向量存储在嵌入文件中。例如:embed_wiki-pdc_d50_norm ## 运行环境 -PaddlePaddle>=1.7.2 -python 2.7/3.5/3.6/3.7 -PaddleRec >=0.1 -os : windows/linux/macos +PaddlePaddle>=1.7.2 +python 2.7/3.5/3.6/3.7 +PaddleRec >=0.1 +os : windows/linux/macos ## 快速开始 @@ -72,7 +72,7 @@ python -m paddlerec.run -m models/match/match-pyramid/config.yaml ## 论文复现 1. 确认您当前所在目录为PaddleRec/models/match/match-pyramid 2. 本文提供了原数据集的下载以及一键生成训练和测试数据的预处理脚本,您可以直接一键运行:bash data_process.sh -执行该脚本,会从国内源的服务器上下载Letor07数据集,删除掉data文件夹中原有的relation.test.fold1.txt和relation.train.fold1.txt,并将完整的数据集解压到data文件夹。随后运行 process.py 将全量训练数据放置于`./data/train`,全量测试数据放置于`./data/test`。并生成用于初始化embedding层的embedding.npy文件 +执行该脚本,会从国内源的服务器上下载Letor07数据集,并将完整的数据集解压到data文件夹。随后运行 process.py 将全量训练数据放置于`./data/big_train`,全量测试数据放置于`./data/big_test`。并生成用于初始化embedding层的embedding.npy文件 执行该脚本的理想输出为: ``` bash data_process.sh @@ -123,6 +123,8 @@ data/embed_wiki-pdc_d50_norm 3. 打开文件config.yaml,更改其中的参数 将workspace改为您当前的绝对路径。(可用pwd命令获取绝对路径) +将dataset_train下的data_path参数改为{workspace}/data/big_train +将dataset_infer下的data_path参数改为{workspace}/data/big_test 4. 随后,您直接一键运行:bash run.sh 即可得到复现的论文效果 执行该脚本后,会执行python -m paddlerec.run -m ./config.yaml 命令开始训练并测试模型,将测试的结果保存到result.txt文件,最后通过执行eval.py进行评估得到数据的map指标 @@ -131,7 +133,7 @@ data/embed_wiki-pdc_d50_norm ..............test................. 13651 336 -('map=', 0.420878322843591) +('map=', 0.3993127885738651) ``` ## 进阶使用 diff --git a/models/match/match-pyramid/run.sh b/models/match/match-pyramid/run.sh index 0d85def7..ad9f5216 100644 --- a/models/match/match-pyramid/run.sh +++ b/models/match/match-pyramid/run.sh @@ -1,6 +1,8 @@ #!/bin/bash echo "................run................." python -m paddlerec.run -m ./config.yaml &>result1.txt -grep -i "prediction" ./result1.txt >./result.txt +grep -i "prediction" ./result1.txt >./result2.txt +sed '$d' result2.txt >result.txt +rm -f result2.txt rm -f result1.txt python eval.py -- GitLab