From 64a60fd5b68d27e071b01e4707694c6aabcee961 Mon Sep 17 00:00:00 2001 From: overlordmax <515704170@qq.com> Date: Wed, 22 Apr 2020 15:59:25 +0800 Subject: [PATCH] fix some bugs --- PaddleRec/{ => ctr}/wide_deep/README.md | 0 PaddleRec/{ => ctr}/wide_deep/args.py | 0 PaddleRec/{ => ctr}/wide_deep/create_data.sh | 0 .../{ => ctr}/wide_deep/data_preparation.py | 0 PaddleRec/{ => ctr}/wide_deep/infer.py | 0 PaddleRec/{ => ctr}/wide_deep/infer_cpu.sh | 0 PaddleRec/{ => ctr}/wide_deep/infer_gpu.sh | 0 PaddleRec/{ => ctr}/wide_deep/net.py | 0 .../{ => ctr}/wide_deep/requirements.txt | 0 PaddleRec/{ => ctr}/wide_deep/train.py | 0 PaddleRec/{ => ctr}/wide_deep/train_cpu.sh | 0 PaddleRec/{ => ctr}/wide_deep/train_gpu.sh | 0 PaddleRec/{ => ctr}/wide_deep/utils.py | 0 PaddleRec/dssm/README.md | 144 ++++++++++++++++++ PaddleRec/dssm/args.py | 36 +++++ PaddleRec/dssm/dssm.py | 119 +++++++++++++++ PaddleRec/dssm/infer.py | 41 +++++ PaddleRec/dssm/infer_cpu.sh | 2 + PaddleRec/dssm/infer_gpu.sh | 2 + PaddleRec/dssm/train_cpu.sh | 9 ++ PaddleRec/dssm/train_gpu.sh | 9 ++ 21 files changed, 362 insertions(+) rename PaddleRec/{ => ctr}/wide_deep/README.md (100%) rename PaddleRec/{ => ctr}/wide_deep/args.py (100%) rename PaddleRec/{ => ctr}/wide_deep/create_data.sh (100%) rename PaddleRec/{ => ctr}/wide_deep/data_preparation.py (100%) rename PaddleRec/{ => ctr}/wide_deep/infer.py (100%) rename PaddleRec/{ => ctr}/wide_deep/infer_cpu.sh (100%) rename PaddleRec/{ => ctr}/wide_deep/infer_gpu.sh (100%) rename PaddleRec/{ => ctr}/wide_deep/net.py (100%) rename PaddleRec/{ => ctr}/wide_deep/requirements.txt (100%) rename PaddleRec/{ => ctr}/wide_deep/train.py (100%) rename PaddleRec/{ => ctr}/wide_deep/train_cpu.sh (100%) rename PaddleRec/{ => ctr}/wide_deep/train_gpu.sh (100%) rename PaddleRec/{ => ctr}/wide_deep/utils.py (100%) create mode 100644 PaddleRec/dssm/README.md create mode 100644 PaddleRec/dssm/args.py create mode 100644 PaddleRec/dssm/dssm.py create mode 100644 
PaddleRec/dssm/infer.py create mode 100644 PaddleRec/dssm/infer_cpu.sh create mode 100644 PaddleRec/dssm/infer_gpu.sh create mode 100644 PaddleRec/dssm/train_cpu.sh create mode 100644 PaddleRec/dssm/train_gpu.sh diff --git a/PaddleRec/wide_deep/README.md b/PaddleRec/ctr/wide_deep/README.md similarity index 100% rename from PaddleRec/wide_deep/README.md rename to PaddleRec/ctr/wide_deep/README.md diff --git a/PaddleRec/wide_deep/args.py b/PaddleRec/ctr/wide_deep/args.py similarity index 100% rename from PaddleRec/wide_deep/args.py rename to PaddleRec/ctr/wide_deep/args.py diff --git a/PaddleRec/wide_deep/create_data.sh b/PaddleRec/ctr/wide_deep/create_data.sh similarity index 100% rename from PaddleRec/wide_deep/create_data.sh rename to PaddleRec/ctr/wide_deep/create_data.sh diff --git a/PaddleRec/wide_deep/data_preparation.py b/PaddleRec/ctr/wide_deep/data_preparation.py similarity index 100% rename from PaddleRec/wide_deep/data_preparation.py rename to PaddleRec/ctr/wide_deep/data_preparation.py diff --git a/PaddleRec/wide_deep/infer.py b/PaddleRec/ctr/wide_deep/infer.py similarity index 100% rename from PaddleRec/wide_deep/infer.py rename to PaddleRec/ctr/wide_deep/infer.py diff --git a/PaddleRec/wide_deep/infer_cpu.sh b/PaddleRec/ctr/wide_deep/infer_cpu.sh similarity index 100% rename from PaddleRec/wide_deep/infer_cpu.sh rename to PaddleRec/ctr/wide_deep/infer_cpu.sh diff --git a/PaddleRec/wide_deep/infer_gpu.sh b/PaddleRec/ctr/wide_deep/infer_gpu.sh similarity index 100% rename from PaddleRec/wide_deep/infer_gpu.sh rename to PaddleRec/ctr/wide_deep/infer_gpu.sh diff --git a/PaddleRec/wide_deep/net.py b/PaddleRec/ctr/wide_deep/net.py similarity index 100% rename from PaddleRec/wide_deep/net.py rename to PaddleRec/ctr/wide_deep/net.py diff --git a/PaddleRec/wide_deep/requirements.txt b/PaddleRec/ctr/wide_deep/requirements.txt similarity index 100% rename from PaddleRec/wide_deep/requirements.txt rename to PaddleRec/ctr/wide_deep/requirements.txt diff --git 
a/PaddleRec/wide_deep/train.py b/PaddleRec/ctr/wide_deep/train.py similarity index 100% rename from PaddleRec/wide_deep/train.py rename to PaddleRec/ctr/wide_deep/train.py diff --git a/PaddleRec/wide_deep/train_cpu.sh b/PaddleRec/ctr/wide_deep/train_cpu.sh similarity index 100% rename from PaddleRec/wide_deep/train_cpu.sh rename to PaddleRec/ctr/wide_deep/train_cpu.sh diff --git a/PaddleRec/wide_deep/train_gpu.sh b/PaddleRec/ctr/wide_deep/train_gpu.sh similarity index 100% rename from PaddleRec/wide_deep/train_gpu.sh rename to PaddleRec/ctr/wide_deep/train_gpu.sh diff --git a/PaddleRec/wide_deep/utils.py b/PaddleRec/ctr/wide_deep/utils.py similarity index 100% rename from PaddleRec/wide_deep/utils.py rename to PaddleRec/ctr/wide_deep/utils.py diff --git a/PaddleRec/dssm/README.md b/PaddleRec/dssm/README.md new file mode 100644 index 00000000..e19fcbe4 --- /dev/null +++ b/PaddleRec/dssm/README.md @@ -0,0 +1,144 @@ +# DSSM + +``` +├── README.md # 文档 +├── dssm.py # dssm网络结构 +├── args.py # 参数脚本 +├── infer.py # 预测脚本 +├── train_gpu.sh # gpu训练shell脚本 +├── train_cpu.sh # cpu训练shell脚本 +├── infer_gpu.sh # gpu预测shell脚本 +├── infer_cpu.sh # cpu预测shell脚本 +``` + +## 简介 + +DSSM[《Learning Deep Structured Semantic Models for Web Search using Clickthrough Data》]( https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2013_DSSM_fullversion.pdf )即基于深度网络的语义模型,其核心思想是将query和doc映射到共同维度的语义空间中,通过最大化query和doc语义向量之间的余弦相似度,从而训练得到隐含语义模型,达到检索的目的,并通过word hashing方法来减少输入向量的维度。DSSM有很广泛的应用,比如:搜索引擎检索,广告相关性,问答系统,机器翻译等。 + +本项目按照论文的网络结构在paddlepaddle上实现DSSM模型,并构造数据集验证网络的正确性。 + +## 环境 + + PaddlePaddle 1.7.0 + + python3.7 + +## 单机训练 + +GPU环境 + +在train_gpu.sh脚本文件中设置好参数。 + +```sh +CUDA_VISIBLE_DEVICES=0 python dssm.py --use_gpu 1 \ #使用gpu + --batch_size 16 \ #batch大小 + --TRIGRAM_D 1000 \ #输入向量维度 + --L1_N 300 \ #第一层mlp大小 + --L2_N 300 \ #第二层mlp大小 + --L3_N 128 \ #第三层mlp大小 + --Neg 4 \ #负采样个数 + --base_lr 0.01 \ #学习率 + --model_dir 'model_dir' #模型保存路径 +``` + +修改脚本的可执行权限并运行 + +```shell +./train_gpu.sh 
+``` + +CPU环境 + +在train_cpu.sh脚本文件中设置好参数。 + +```sh +python dssm.py --use_gpu 0 \ #使用cpu + --batch_size 16 \ #batch大小 + --TRIGRAM_D 1000 \ #输入向量维度 + --L1_N 300 \ #第一层mlp大小 + --L2_N 300 \ #第二层mlp大小 + --L3_N 128 \ #第三层mlp大小 + --Neg 4 \ #负采样个数 + --base_lr 0.01 \ #学习率 + --model_dir 'model_dir' #模型保存路径 +``` + +修改脚本的可执行权限并运行 + +``` +./train_cpu.sh +``` + +## 预测 + +GPU环境 + +在infer_gpu.sh脚本文件中设置好参数。 + +```sh +CUDA_VISIBLE_DEVICES=0 python infer.py --use_gpu 1 \ #使用gpu + --model_dir 'model_dir' #模型路径 +``` + +修改脚本的可执行权限并运行 + +```sh +./infer_gpu.sh +``` + +CPU环境 + +在infer_cpu.sh脚本文件中设置好参数。 + +```sh +python infer.py --use_gpu 0 \ #使用cpu + --model_dir 'model_dir' #模型路径 +``` + +修改脚本的可执行权限并运行 + +``` +./infer_cpu.sh +``` + + + +## 模型效果 + +随机构造4个负样本进行训练,可见loss达到收敛状态。 + +```txt +W0422 15:36:37.033936 1627 device_context.cc:237] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 9.2, Runtime API Version: 9.0 +W0422 15:36:37.039381 1627 device_context.cc:245] device: 0, cuDNN Version: 7.3. +2020-04-22 15:36:38,718-INFO: epoch_id: 0, batch_time: 0.02135s, loss: 25.05417 +2020-04-22 15:36:38,734-INFO: epoch_id: 1, batch_time: 0.01645s, loss: 16.14477 +2020-04-22 15:36:38,750-INFO: epoch_id: 2, batch_time: 0.01573s, loss: 12.89269 +2020-04-22 15:36:38,766-INFO: epoch_id: 3, batch_time: 0.01551s, loss: 11.51237 +2020-04-22 15:36:38,785-INFO: epoch_id: 4, batch_time: 0.01890s, loss: 10.70215 +...... 
+ +2020-04-22 15:36:40,267-INFO: epoch_id: 95, batch_time: 0.01512s, loss: 7.13324 +2020-04-22 15:36:40,282-INFO: epoch_id: 96, batch_time: 0.01502s, loss: 7.14063 +2020-04-22 15:36:40,298-INFO: epoch_id: 97, batch_time: 0.01506s, loss: 7.13577 +2020-04-22 15:36:40,314-INFO: epoch_id: 98, batch_time: 0.01512s, loss: 7.13683 +2020-04-22 15:36:40,329-INFO: epoch_id: 99, batch_time: 0.01519s, loss: 7.13883 +``` + +预测阶段可算出query和doc的相似度 + +```txt +W0422 15:40:16.847975 1752 device_context.cc:237] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 9.2, Runtime API Version: 9.0 +W0422 15:40:16.853554 1752 device_context.cc:245] device: 0, cuDNN Version: 7.3. +2020-04-22 15:40:18,589-INFO: query_doc_sim: 0.99267 +2020-04-22 15:40:18,593-INFO: query_doc_sim: 0.99123 +2020-04-22 15:40:18,596-INFO: query_doc_sim: 0.99198 +2020-04-22 15:40:18,599-INFO: query_doc_sim: 0.99010 +2020-04-22 15:40:18,602-INFO: query_doc_sim: 0.98832 +...... +2020-04-22 15:40:18,854-INFO: query_doc_sim: 0.99079 +2020-04-22 15:40:18,857-INFO: query_doc_sim: 0.98585 +2020-04-22 15:40:18,860-INFO: query_doc_sim: 0.98702 +2020-04-22 15:40:18,863-INFO: query_doc_sim: 0.99151 +2020-04-22 15:40:18,867-INFO: query_doc_sim: 0.98917 +``` + diff --git a/PaddleRec/dssm/args.py b/PaddleRec/dssm/args.py new file mode 100644 index 00000000..d47736e8 --- /dev/null +++ b/PaddleRec/dssm/args.py @@ -0,0 +1,36 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import distutils.util + + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--batch_size", type=int, default=16, help="batch_size") + parser.add_argument('--use_gpu', type=int, default=0, help='whether using gpu') + parser.add_argument('--TRIGRAM_D', type=int, default=1000, help='TRIGRAM_D') + parser.add_argument('--L1_N', type=int, default=300, help='L1_N') + parser.add_argument('--L2_N', type=int, default=300, help='L2_N') + parser.add_argument('--L3_N', type=int, default=128, help='L3_N') + parser.add_argument('--Neg', type=int, default=4, help='Neg') + parser.add_argument('--base_lr', type=float, default=0.01, help='base_lr') + parser.add_argument('--model_dir', type=str, default="model_dir", help='model_dir') + args = parser.parse_args() + return args + diff --git a/PaddleRec/dssm/dssm.py b/PaddleRec/dssm/dssm.py new file mode 100644 index 00000000..4d959f7b --- /dev/null +++ b/PaddleRec/dssm/dssm.py @@ -0,0 +1,119 @@ +import paddle.fluid as fluid +import numpy as np +import sys +import args +import logging +import time + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger("fluid") +logger.setLevel(logging.INFO) + +def fc(tag, data, out_dim, active='prelu'): + + xavier=fluid.initializer.Xavier(uniform=True, fan_in=data.shape[1], fan_out=out_dim) + + out = fluid.layers.fc(input=data, + size=out_dim, + act=active, + param_attr=xavier, + bias_attr =xavier, + name=tag) + return out + +def model(TRIGRAM_D = 1000, L1_N = 300, L2_N = 300, L3_N = 128, Neg = 4): + query = fluid.data(name="query", shape=[-1, TRIGRAM_D], dtype="float32") + doc_pos = fluid.data(name="doc_pos", shape=[-1, TRIGRAM_D], dtype="float32") + doc_negs = 
[fluid.data(name="doc_neg_" + str(i), shape=[-1, TRIGRAM_D], dtype="float32") for i in range(Neg)] + + active = 'tanh' + query_l1 = fc('query_l1', query, L1_N, active) + doc_pos_l1 = fc('doc_pos_l1', doc_pos, L1_N, active) + + query_l2 = fc('query_l2', query_l1, L2_N, active) + doc_pos_l2 = fc('doc_l2', doc_pos_l1, L2_N, active) + + query_l3 = fc('query_l3', query_l2, L3_N, active) + doc_pos_l3 = fc('doc_l3', doc_pos_l2, L3_N, active) + + neg_doc_sems = [] + for i, doc_neg in enumerate(doc_negs): + doc_neg_l1 = fc('doc_neg_l1_' + str(i), doc_neg, L1_N, active) + doc_neg_l2 = fc('doc_neg_l2_' + str(i), doc_neg_l1, L2_N, active) + doc_neg_l3 = fc('doc_neg_l3_' + str(i), doc_neg_l2, L3_N, active) + + neg_doc_sems.append(doc_neg_l3) + + R_Q_D_p = fluid.layers.cos_sim(query_l3, doc_pos_l3) + R_Q_D_ns = [fluid.layers.cos_sim(query_l3, neg_doc_sem) for neg_doc_sem in neg_doc_sems] + + concat_Rs = fluid.layers.concat(input=[R_Q_D_p] + R_Q_D_ns, axis=-1) + prob = fluid.layers.softmax(concat_Rs, axis=1) + hit_prob = fluid.layers.slice(prob, axes=[0,1], starts=[0,0], ends=[args.batch_size, 1]) + + loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob)) + avg_loss = fluid.layers.mean(x=loss) + + return avg_loss, R_Q_D_p, [query] + [doc_pos] + doc_negs + +args = args.parse_args() +loss,R_Q_D_p, data_list = model(args.TRIGRAM_D,args.L1_N,args.L2_N,args.L3_N,args.Neg) + +sgd = fluid.optimizer.SGD(learning_rate=args.base_lr) +sgd.minimize(loss) + +place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() +exe = fluid.Executor(place) +exe.run(fluid.default_startup_program()) + +# Build a random data set. 
sample_size = 100
l_Qs = []
pos_l_Ds = []

# One random (batch_size, TRIGRAM_D) query batch and one positive-document
# batch per step.
for i in range(sample_size):
    l_Q = np.random.rand(args.batch_size, args.TRIGRAM_D)
    l_Qs.append(l_Q)

    l_D = np.random.rand(args.batch_size, args.TRIGRAM_D)
    pos_l_Ds.append(l_D)

# neg_l_Ds[j][i] is the j-th negative batch for step i: the positive batch of
# some other step. Sampling without replacement needs Neg <= sample_size - 1.
neg_l_Ds = [[] for _ in range(args.Neg)]
for i in range(sample_size):
    possibilities = list(range(sample_size))
    possibilities.remove(i)
    negatives = np.random.choice(possibilities, args.Neg, replace=False)
    for j in range(args.Neg):
        neg_l_Ds[j].append(pos_l_Ds[negatives[j]])

for i in range(sample_size):
    begin = time.time()
    feed = {
        "query": l_Qs[i].astype('float32'),
        "doc_pos": pos_l_Ds[i].astype('float32'),
    }
    # Feed exactly the Neg inputs the graph declares; the earlier version
    # hard-coded doc_neg_0..doc_neg_3 and broke for any other --Neg value.
    for j in range(args.Neg):
        feed["doc_neg_" + str(j)] = neg_l_Ds[j][i].astype('float32')

    loss_data = exe.run(fluid.default_main_program(),
                        feed=feed,
                        return_numpy=True,
                        fetch_list=[loss.name])
    end = time.time()

    # Pull the single fetched value out explicitly instead of calling float()
    # on a multi-dimensional array.
    loss_value = float(np.array(loss_data).reshape(-1)[0])
    # The message says "epoch_id" but ``i`` counts batches: each iteration
    # feeds one batch. The text is left unchanged.
    logger.info("epoch_id: {}, batch_time: {:.5f}s, loss: {:.5f}".format(i, end - begin, loss_value))

# Only the query and positive-document inputs are needed to score a pair.
feed_var_names = ["query", "doc_pos"]
fetch_vars = [R_Q_D_p]
fluid.io.save_inference_model(args.model_dir, feed_var_names, fetch_vars, exe)


# diff --git a/PaddleRec/dssm/infer.py b/PaddleRec/dssm/infer.py
# new file mode 100644 (index 00000000..55cfd43a)
import paddle.fluid as fluid
import numpy as np
import sys
import args
import logging

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)


def infer(args):
    """Load the saved inference model and log query/doc similarities.

    Args:
        args: parsed options; reads ``use_gpu``, ``model_dir`` and
            ``TRIGRAM_D``.

    Logs one ``query_doc_sim`` line for each of 100 random query/document
    pairs and returns nothing.
    """
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    # Load and run inside a fresh scope.
    with fluid.scope_guard(fluid.Scope()):
        infer_program, feed_target_names, fetch_vars = fluid.io.load_inference_model(args.model_dir, exe)

        # Build random test data: one query row and one document row per sample.
        sample_size = 100
        l_Qs = []
        pos_l_Ds = []

        for i in range(sample_size):
            l_Q = np.random.rand(1, args.TRIGRAM_D)
            l_Qs.append(l_Q)

            l_D = np.random.rand(1, args.TRIGRAM_D)
            pos_l_Ds.append(l_D)

        for i in range(sample_size):
            con_sim = exe.run(infer_program,
                              feed={"query": l_Qs[i].astype('float32'),
                                    "doc_pos": pos_l_Ds[i].astype('float32')},
                              fetch_list=fetch_vars,
                              return_numpy=True)

            logger.info("query_doc_sim: {:.5f}".format(np.array(con_sim).reshape(-1, 1)[0][0]))


if __name__ == "__main__":
    args = args.parse_args()
    infer(args)


# diff --git a/PaddleRec/dssm/infer_cpu.sh b/PaddleRec/dssm/infer_cpu.sh
# new file mode 100644 (index 00000000..ce380496)
#
#   python infer.py --use_gpu 0 \
#       --model_dir 'model_dir'
#
# diff --git a/PaddleRec/dssm/infer_gpu.sh b/PaddleRec/dssm/infer_gpu.sh
# new file mode 100644 (index 00000000..dcce70b3)
#
#   CUDA_VISIBLE_DEVICES=0 python infer.py --use_gpu 1 \
#       --model_dir 'model_dir'
#
# diff --git a/PaddleRec/dssm/train_cpu.sh b/PaddleRec/dssm/train_cpu.sh
# new file mode 100644 (index 00000000..1ac62be7)
#
#   python dssm.py --use_gpu 0 \
#       --batch_size 16 \
#       --TRIGRAM_D 1000 \
#       --L1_N 300 \
#       --L2_N 300 \
#       --L3_N 128 \
#       --Neg 4 \
#       --base_lr 0.01 \
#       --model_dir 'model_dir'
#
# diff --git a/PaddleRec/dssm/train_gpu.sh b/PaddleRec/dssm/train_gpu.sh
# new file mode 100644 (index 00000000..7c80df1b)
#
#   CUDA_VISIBLE_DEVICES=0 python dssm.py --use_gpu 1 \
#       --batch_size 16 \
#       --TRIGRAM_D 1000 \
#       --L1_N 300 \
#       --L2_N 300 \
#       --L3_N 128 \
#       --Neg 4 \
#       --base_lr 0.01 \
#       --model_dir 'model_dir'
#
# -- GitLab