diff --git a/models/match/dssm/config.yaml b/models/match/dssm/config.yaml
index 67492fb81d2799d9a4cad6653773039122f6615d..cf68f99320ae2879a129f97650ca4019ff5585bd 100755
--- a/models/match/dssm/config.yaml
+++ b/models/match/dssm/config.yaml
@@ -37,11 +37,11 @@ hyper_parameters:
   fc_sizes: [300, 300, 128]
   fc_acts: ['tanh', 'tanh', 'tanh']
 
-mode: runner1
+mode: train_runner
 # config of each runner.
 # runner is a kind of paddle training class, which wraps the train/infer process.
 runner:
-- name: runner1
+- name: train_runner
   class: single_train
   # num of epochs
   epochs: 4
@@ -55,7 +55,7 @@ runner:
   save_inference_fetch_varnames: ["cos_sim_0.tmp_0"] # fetch vars of save inference
   init_model_path: "" # load model path
   fetch_period: 2
-- name: runner2
+- name: infer_runner
   class: single_infer
   # num of epochs
   epochs: 1
diff --git a/models/match/dssm/readme.md b/models/match/dssm/readme.md
deleted file mode 100644
index db6e43a7b77b972bd99dd363e86c23b1e4716a07..0000000000000000000000000000000000000000
--- a/models/match/dssm/readme.md
+++ /dev/null
@@ -1,52 +0,0 @@
-# DSSM
-
-## 简介
-
-DSSM[《Learning Deep Structured Semantic Models for Web Search using Clickthrough Data》]( https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2013_DSSM_fullversion.pdf )即基于深度网络的语义模型,其核心思想是将query和doc映射到共同维度的语义空间中,通过最大化query和doc语义向量之间的余弦相似度,从而训练得到隐含语义模型,达到检索的目的,并通过word hashing方法来减少输入向量的维度。DSSM有很广泛的应用,比如:搜索引擎检索,广告相关性,问答系统,机器翻译等。
-
-本项目按照论文的网络结构在paddlepaddle上实现DSSM模型,并构造数据集验证网络的正确性。
-
-## 模型超参
-```
-optimizer:
-  class: sgd # 优化器
-  learning_rate: 0.01 # 学习率
-  strategy: async # 参数更新方式
-TRIGRAM_D: 1000 # query和doc语义向量长度
-NEG: 4 # 负采样个数
-fc_sizes: [300, 300, 128] # fc层大小
-fc_acts: ['tanh', 'tanh', 'tanh'] # fc层激活函数
-
-```
-
-## 快速开始
-PaddleRec内置了demo小数据,方便用户快速使用模型,训练命令如下:
-```bash
-python -m paddlerec.run -m paddlerec.models.match.dssm
-```
-
-执行预测前,需更改config.yaml中的配置,具体改动如下:
-```
-workspace: "~/code/paddlerec/models/match/dssm" # 改为当前config.yaml所在的绝对路径
-
-#mode: runner1 # train
-mode: runner2 # infer
-
-runner:
-- name: runner2
-  class: single_infer
-  init_model_path: "increment/2" # 改为需要预测的模型路径
-
-phase:
-- name: phase1
-  model: "{workspace}/model.py"
-  dataset_name: dataset_infer # 改成预测dataset
-  thread_num: 1 # dataset线程数
-```
-改完之后,执行预测命令:
-```
-python -m paddlerec.run -m ./config.yaml
-```
-
-## 提测说明
-当前,DSSM模型采用的数据集是随机构造的,因此提测仅需按上述步骤在demo数据集上跑通即可。
diff --git a/models/match/multiview-simnet/config.yaml b/models/match/multiview-simnet/config.yaml
index 43f0b27f00eb0079a4777a3eec122e588fd7c802..276ddae156b36b27a46bd370bb79161575f0c558 100755
--- a/models/match/multiview-simnet/config.yaml
+++ b/models/match/multiview-simnet/config.yaml
@@ -44,11 +44,11 @@ hyper_parameters:
   margin: 0.1
 
 # select runner by name
-mode: runner1
+mode: train_runner
 # config of each runner.
 # runner is a kind of paddle training class, which wraps the train/infer process.
 runner:
-- name: runner1
+- name: train_runner
   class: single_train
   # num of epochs
   epochs: 2
@@ -62,7 +62,7 @@ runner:
   save_inference_fetch_varnames: [] # fetch vars of save inference
   init_model_path: "" # load model path
   fetch_period: 1
-- name: runner2
+- name: infer_runner
   class: single_infer
   # num of epochs
   epochs: 1
diff --git a/models/match/multiview-simnet/readme.md b/models/match/multiview-simnet/readme.md
deleted file mode 100644
index cc09c2f2928219472eae9fd1fd7ccf3971f6662e..0000000000000000000000000000000000000000
--- a/models/match/multiview-simnet/readme.md
+++ /dev/null
@@ -1,55 +0,0 @@
-# Multi-view Simnet for Personalized recommendation
-
-## 简介
-
-在个性化推荐场景中,推荐系统给用户提供的项目(Item)列表通常是通过个性化的匹配模型计算出来的。在现实世界中,一个用户可能有很多个视角的特征,比如用户Id,年龄,项目的点击历史等。一个项目,举例来说,新闻资讯,也会有多种视角的特征比如新闻标题,新闻类别等。Multi-view Simnet模型是可以融合用户以及推荐项目的多个视角的特征并进行个性化匹配学习的一体化模型。这类模型在很多工业化的场景中都会被使用到,比如百度的Feed产品中。
-
-本项目的目标是提供一个在个性化匹配场景下利用Paddle搭建的模型。Multi-view Simnet模型包括多个编码器模块,每个编码器被用在不同的特征视角上。当前,项目中提供Bag-of-Embedding编码器,Temporal-Convolutional编码器,和Gated-Recurrent-Unit编码器。我们会逐渐加入稀疏特征场景下比较实用的编码器到这个项目中。模型的训练方法,当前采用的是Pairwise ranking模式进行训练,即针对一对具有关联的User-Item组合,并随机产出一个Item作为负例进行排序学习。
-
-## 模型超参
-```
-optimizer:
-  class: Adam # 优化器类型
-  learning_rate: 0.0001 # 学习率
-  strategy: async # 参数更新方式
-query_encoder: "bow" # 用户特征编码器
-title_encoder: "bow" # item特征编码器
-query_encode_dim: 128 # 用户编码器产出的特征维度
-title_encode_dim: 128 # item编码器产出的特征维度
-sparse_feature_dim: 1000001 # 用户特征及item特征,所有特征总个数
-embedding_dim: 128 # 特征维度
-hidden_size: 128 # 隐藏层维度
-margin: 0.1 # max margin for hinge-loss
-```
-
-## 快速开始
-PaddleRec内置了demo小数据,方便用户快速使用模型,训练命令如下:
-```bash
-python -m paddlerec.run -m paddlerec.models.match.multiview-simnet
-```
-
-执行预测前,需更改config.yaml中的配置,具体改动如下:
-```
-workspace: "~/code/paddlerec/models/match/multiview-simnet" # 改为当前config.yaml所在的绝对路径
-
-#mode: runner1 # train
-mode: runner2 # infer
-
-runner:
-- name: runner2
-  class: single_infer
-  init_model_path: "increment/2" # 改为需要预测的模型路径
-
-phase:
-- name: phase1
-  model: "{workspace}/model.py"
-  dataset_name: dataset_infer # 改成预测dataset
-  thread_num: 1 # dataset线程数
-```
-改完之后,执行预测命令:
-```
-python -m paddlerec.run -m ./config.yaml
-```
-
-## 提测说明
-当前,Multi-view Simnet模型采用的数据集是机器随机构造的,因此提测仅需按上述步骤在demo数据集上跑通即可。
diff --git a/models/match/readme.md b/models/match/readme.md
index d9f91b257d81ffde820a04cad49b56edbd903f6a..5599dfbfcb7638e50d916b7014ed742307dc9717 100755
--- a/models/match/readme.md
+++ b/models/match/readme.md
@@ -31,9 +31,21 @@
 
 
 
-## 使用教程
-### 训练&预测
+## 使用教程(快速开始)
+### 训练
 ```shell
 python -m paddlerec.run -m paddlerec.models.match.dssm # dssm
 python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-simnet
 ```
+
+### 预测
+```shell
+# 修改对应模型的config.yaml, workspace配置为当前目录的绝对路径
+# 修改对应模型的config.yaml,mode配置infer_runner
+# 示例: mode: train_runner -> mode: infer_runner
+# infer_runner中 class配置为 class: single_infer
+# 修改phase阶段为infer的配置,参照config注释
+
+# 修改完config.yaml后 执行:
+python -m paddlerec.run -m ./config.yaml # 以dssm为例
+```
diff --git a/models/recall/gnn/config.yaml b/models/recall/gnn/config.yaml
index d200ddc37083ab68b1456ce7fd7bef4fb9870985..f4d5b1efdd213c7ef14af170ab124426b5bc14c6 100755
--- a/models/recall/gnn/config.yaml
+++ b/models/recall/gnn/config.yaml
@@ -36,17 +36,17 @@ hyper_parameters:
     decay_steps: 3
     decay_rate: 0.1
     l2: 0.00001
-  sparse_feature_nums: 43098
+  sparse_feature_number: 43098
   sparse_feature_dim: 100
   corpus_size: 719470
   gnn_propogation_steps: 1
 
 # select runner by name
-mode: runner1
+mode: train_runner
 # config of each runner.
 # runner is a kind of paddle training class, which wraps the train/infer process.
 runner:
-- name: runner1
+- name: train_runner
   class: single_train
   # num of epochs
   epochs: 2
@@ -60,7 +60,7 @@ runner:
   save_inference_fetch_varnames: [] # fetch vars of save inference
   init_model_path: "" # load model path
   fetch_period: 10
-- name: runner2
+- name: infer_runner
   class: single_infer
   # num of epochs
   epochs: 1
diff --git a/models/recall/gnn/raw_data/convert_data.py b/models/recall/gnn/data/convert_data.py
similarity index 100%
rename from models/recall/gnn/raw_data/convert_data.py
rename to models/recall/gnn/data/convert_data.py
diff --git a/models/recall/gnn/raw_data/download.py b/models/recall/gnn/data/download.py
similarity index 100%
rename from models/recall/gnn/raw_data/download.py
rename to models/recall/gnn/data/download.py
diff --git a/models/recall/gnn/raw_data/preprocess.py b/models/recall/gnn/data/preprocess.py
similarity index 100%
rename from models/recall/gnn/raw_data/preprocess.py
rename to models/recall/gnn/data/preprocess.py
diff --git a/models/recall/gnn/data_process.sh b/models/recall/gnn/data_prepare.sh
similarity index 82%
rename from models/recall/gnn/data_process.sh
rename to models/recall/gnn/data_prepare.sh
index fc7ed827e0368c59cab8134d22f78e2200980f18..00a3dcebb01f33424ed9e9517967e5cb613bee81 100755
--- a/models/recall/gnn/data_process.sh
+++ b/models/recall/gnn/data_prepare.sh
@@ -17,7 +17,7 @@
 set -e
 echo "begin to download data"
-cd raw_data && python download.py
+cd data && python download.py
 mkdir diginetica
 python preprocess.py --dataset diginetica
@@ -26,8 +26,10 @@ python convert_data.py --data_dir diginetica
 
 cat diginetica/train.txt | wc -l >> diginetica/config.txt
 
-mkdir train_data
-mv diginetica/train.txt train_data
+rm -rf train && mkdir train
+mv diginetica/train.txt train
 
-mkdir test_data
-mv diginetica/test.txt test_data
+rm -rf test && mkdir test
+mv diginetica/test.txt test
+
+mv diginetica/config.txt ./config.txt
diff --git a/models/recall/gnn/model.py b/models/recall/gnn/model.py
index 6e7d2ab5de4b31d3154e4a2c27aabdd1a7374555..74ffd7866d92824a0d23aead5bed3d143727381b 100755
--- a/models/recall/gnn/model.py
+++ b/models/recall/gnn/model.py
@@ -36,7 +36,7 @@ class Model(ModelBase):
         self.l2 = envs.get_global_env("hyper_parameters.optimizer.l2")
         self.dict_size = envs.get_global_env(
-            "hyper_parameters.sparse_feature_nums")
+            "hyper_parameters.sparse_feature_number")
         self.corpus_size = envs.get_global_env("hyper_parameters.corpus_size")
         self.train_batch_size = envs.get_global_env(
diff --git a/models/recall/word2vec/config.yaml b/models/recall/word2vec/config.yaml
index 7a1452a7a3e7dc31f381a05ec806386179c39b67..e2785555c32d3a2300c36a2eba6e8b030fc172b9 100755
--- a/models/recall/word2vec/config.yaml
+++ b/models/recall/word2vec/config.yaml
@@ -35,18 +35,18 @@ hyper_parameters:
     decay_rate: 0.999
     class: sgd
     strategy: async
-  sparse_feature_number: 85
+  sparse_feature_number: 354051
   sparse_feature_dim: 300
   with_shuffle_batch: False
   neg_num: 5
   window_size: 5
 
 # select runner by name
-mode: runner1
+mode: train_runner
 # config of each runner.
 # runner is a kind of paddle training class, which wraps the train/infer process.
 runner:
-- name: runner1
+- name: train_runner
   class: single_train
   # num of epochs
   epochs: 2
@@ -60,7 +60,7 @@ runner:
   save_inference_fetch_varnames: [] # fetch vars of save inference
   init_model_path: "" # load model path
   fetch_period: 10
-- name: runner2
+- name: infer_runner
   class: single_infer
   # num of epochs
   epochs: 1
diff --git a/models/recall/word2vec/model.py b/models/recall/word2vec/model.py
index 16dc94f47730484c0ffcfab998b633e45f185953..5822c90fa8b91342cb6d44d1e5cb1781ef96e9cd 100755
--- a/models/recall/word2vec/model.py
+++ b/models/recall/word2vec/model.py
@@ -70,13 +70,14 @@ class Model(ModelBase):
         def embedding_layer(input,
                             table_name,
+                            emb_dim,
                             initializer_instance=None,
                             squeeze=False):
             emb = fluid.embedding(
                 input=input,
                 is_sparse=True,
                 is_distributed=self.is_distributed,
-                size=[self.sparse_feature_number, self.sparse_feature_dim],
+                size=[self.sparse_feature_number, emb_dim],
                 param_attr=fluid.ParamAttr(
                     name=table_name, initializer=initializer_instance), )
             if squeeze:
@@ -84,12 +85,16 @@ class Model(ModelBase):
             else:
                 return emb
 
-        init_width = 1.0 / self.sparse_feature_dim
+        init_width = 0.5 / self.sparse_feature_dim
         emb_initializer = fluid.initializer.Uniform(-init_width, init_width)
         emb_w_initializer = fluid.initializer.Constant(value=0.0)
 
-        input_emb = embedding_layer(inputs[0], "emb", emb_initializer, True)
-        true_emb_w = embedding_layer(inputs[1], "emb_w", emb_w_initializer,
+        input_emb = embedding_layer(inputs[0], "emb", self.sparse_feature_dim,
+                                    emb_initializer, True)
+        true_emb_w = embedding_layer(inputs[1], "emb_w",
+                                     self.sparse_feature_dim,
+                                     emb_w_initializer, True)
+        true_emb_b = embedding_layer(inputs[1], "emb_b", 1, emb_w_initializer,
                                      True)
 
         if self.with_shuffle_batch:
@@ -102,34 +107,74 @@ class Model(ModelBase):
             neg_emb_w = fluid.layers.reshape(
                 neg_emb_w_concat,
                 shape=[-1, self.neg_num, self.sparse_feature_dim])
+
+            neg_emb_b_list = []
+            for i in range(self.neg_num):
+                neg_emb_b_list.append(
+                    fluid.contrib.layers.shuffle_batch(
+                        true_emb_b))  # shuffle true_word
+            neg_emb_b = fluid.layers.concat(neg_emb_b_list, axis=0)
+            neg_emb_b_vec = fluid.layers.reshape(
+                neg_emb_b, shape=[-1, self.neg_num])
         else:
-            neg_emb_w = embedding_layer(inputs[2], "emb_w", emb_w_initializer)
-        true_logits = fluid.layers.reduce_sum(
-            fluid.layers.elementwise_mul(input_emb, true_emb_w),
-            dim=1,
-            keep_dim=True)
+            neg_emb_w = embedding_layer(
+                inputs[2], "emb_w", self.sparse_feature_dim, emb_w_initializer)
+            neg_emb_b = embedding_layer(inputs[2], "emb_b", 1,
+                                        emb_w_initializer)
+            neg_emb_b_vec = fluid.layers.reshape(
+                neg_emb_b, shape=[-1, self.neg_num])
+
+        true_logits = fluid.layers.elementwise_add(
+            fluid.layers.reduce_sum(
+                fluid.layers.elementwise_mul(input_emb, true_emb_w),
+                dim=1,
+                keep_dim=True),
+            true_emb_b)
 
         input_emb_re = fluid.layers.reshape(
             input_emb, shape=[-1, 1, self.sparse_feature_dim])
         neg_matmul = fluid.layers.matmul(
             input_emb_re, neg_emb_w, transpose_y=True)
-        neg_logits = fluid.layers.reshape(neg_matmul, shape=[-1, 1])
+        neg_matmul_re = fluid.layers.reshape(
+            neg_matmul, shape=[-1, self.neg_num])
+        neg_logits = fluid.layers.elementwise_add(neg_matmul_re, neg_emb_b_vec)
+
         #nce loss
-        logits = fluid.layers.concat([true_logits, neg_logits], axis=0)
         label_ones = fluid.layers.fill_constant(
             shape=[fluid.layers.shape(true_logits)[0], 1],
             value=1.0,
             dtype='float32')
         label_zeros = fluid.layers.fill_constant(
-            shape=[fluid.layers.shape(neg_logits)[0], 1],
+            shape=[fluid.layers.shape(true_logits)[0], self.neg_num],
             value=0.0,
             dtype='float32')
-        label = fluid.layers.concat([label_ones, label_zeros], axis=0)
-        loss = fluid.layers.log_loss(fluid.layers.sigmoid(logits), label)
-        avg_cost = fluid.layers.reduce_sum(loss)
+        true_xent = fluid.layers.sigmoid_cross_entropy_with_logits(true_logits,
+                                                                   label_ones)
+        neg_xent = fluid.layers.sigmoid_cross_entropy_with_logits(neg_logits,
+                                                                  label_zeros)
+        cost = fluid.layers.elementwise_add(
+            fluid.layers.reduce_sum(
+                true_xent, dim=1),
+            fluid.layers.reduce_sum(
+                neg_xent, dim=1))
+        avg_cost = fluid.layers.reduce_mean(cost)
 
         self._cost = avg_cost
+        global_right_cnt = fluid.layers.create_global_var(
+            name="global_right_cnt",
+            persistable=True,
+            dtype='float32',
+            shape=[1],
+            value=0)
+        global_total_cnt = fluid.layers.create_global_var(
+            name="global_total_cnt",
+            persistable=True,
+            dtype='float32',
+            shape=[1],
+            value=0)
+        global_right_cnt.stop_gradient = True
+        global_total_cnt.stop_gradient = True
         self._metrics["LOSS"] = avg_cost
 
     def optimizer(self):
@@ -195,27 +240,26 @@ class Model(ModelBase):
                 fluid.layers.equal(pred_idx, label), dtype='float32'))
         total_cnt = fluid.layers.reduce_sum(label_ones)
 
-        # global_right_cnt = fluid.layers.create_global_var(
-        #     name="global_right_cnt",
-        #     persistable=True,
-        #     dtype='float32',
-        #     shape=[1],
-        #     value=0)
-        # global_total_cnt = fluid.layers.create_global_var(
-        #     name="global_total_cnt",
-        #     persistable=True,
-        #     dtype='float32',
-        #     shape=[1],
-        #     value=0)
-        # global_right_cnt.stop_gradient = True
-        # global_total_cnt.stop_gradient = True
-
-        # tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt)
-        # fluid.layers.assign(tmp1, global_right_cnt)
-        # tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt)
-        # fluid.layers.assign(tmp2, global_total_cnt)
-
-        # acc = fluid.layers.elementwise_div(
-        #     global_right_cnt, global_total_cnt, name="total_acc")
-        acc = fluid.layers.elementwise_div(right_cnt, total_cnt, name="acc")
+        global_right_cnt = fluid.layers.create_global_var(
+            name="global_right_cnt",
+            persistable=True,
+            dtype='float32',
+            shape=[1],
+            value=0)
+        global_total_cnt = fluid.layers.create_global_var(
+            name="global_total_cnt",
+            persistable=True,
+            dtype='float32',
+            shape=[1],
+            value=0)
+        global_right_cnt.stop_gradient = True
+        global_total_cnt.stop_gradient = True
+
+        tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt)
+        fluid.layers.assign(tmp1, global_right_cnt)
+        tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt)
+        fluid.layers.assign(tmp2, global_total_cnt)
+
+        acc = fluid.layers.elementwise_div(
+            global_right_cnt, global_total_cnt, name="total_acc")
         self._infer_results['acc'] = acc
diff --git a/models/recall/word2vec/prepare_data.sh b/models/recall/word2vec/prepare_data.sh
index cfd067350ce1d33112806ab72ca78222381a86f4..f7da1a10decbf944f56110016bf82137188a3456 100755
--- a/models/recall/word2vec/prepare_data.sh
+++ b/models/recall/word2vec/prepare_data.sh
@@ -22,16 +22,17 @@
 tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar
 mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ raw_data/
 # preprocess data
-python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/test_build_dict
-python preprocess.py --filter_corpus --dict_path raw_data/test_build_dict --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --min_count 5 --downsample 0.001
-mkdir thirdparty
-mv raw_data/test_build_dict thirdparty/
-mv raw_data/test_build_dict_word_to_id_ thirdparty/
+python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/word_count_dict.txt
+python preprocess.py --filter_corpus --dict_path raw_data/word_count_dict.txt --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --min_count 5 --downsample 0.001
+mv raw_data/word_count_dict.txt data/dict/
+mv raw_data/word_id_dict.txt data/dict/
 
-python preprocess.py --data_resplit --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=train_data
+rm -rf data/train/*
+rm -rf data/test/*
+python preprocess.py --data_resplit --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=data/data
 
 # download test data
 wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
 tar xzvf test_dir.tar -C raw_data
-mv raw_data/data/test_dir test_data/
+mv raw_data/data/test_dir data/test/
 rm -rf raw_data