diff --git a/dssm/README.cn.md b/dssm/README.cn.md
index b65c11df7d00f34b8378c92371858ca383827a1d..17f9a923c0418c72194f4f54316f04b70ba443dc 100644
--- a/dssm/README.cn.md
+++ b/dssm/README.cn.md
@@ -13,7 +13,7 @@ DSSM \[[1](##参考文献)\]是微软研究院13年提出来的经典的语义
 DSSM 已经发展成了一个框架,可以很自然地建模两个记录之间的距离关系,
 例如对于文本相关性问题,可以用余弦相似度 (cosin similarity) 来刻画语义距离;
-而对于搜索引擎的结果排序,可以在DSSM上接上Rank损失训练处一个排序模型。
+而对于搜索引擎的结果排序,可以在DSSM上接上Rank损失训练出一个排序模型。
 
 ## 模型简介
 在原论文\[[1](#参考文献)\]中,DSSM模型用来衡量用户搜索词 Query 和文档集合 Documents 之间隐含的语义关系,模型结构如下
@@ -24,7 +24,7 @@ DSSM 已经发展成了一个框架,可以很自然地建模两个记录之间

 其贯彻的思想是, **用DNN将高维特征向量转化为低纬空间的连续向量(图中红色框部分)** ,
-**在上层用cosin similarity来衡量用户搜索词与候选文档间的语义相关性** 。
+**在上层用cosine similarity来衡量用户搜索词与候选文档间的语义相关性** 。
 在最顶层损失函数的设计上,原始模型使用类似Word2Vec中负例采样的方法,
 一个Query会抽取正例 $D+$ 和4个负例 $D-$ 整体上算条件概率用对数似然函数作为损失,
@@ -165,7 +165,13 @@ def create_rnn(self, emb, prefix=''):
     '''
     A GRU sentence vector learner.
     '''
-    gru = paddle.layer.gru_memory(input=emb,)
+    gru = paddle.networks.simple_gru(
+        input=emb,
+        size=self.dnn_dims[1],
+        mixed_param_attr=ParamAttr(name='%s_gru_mixed.w' % prefix),
+        mixed_bias_param_attr=ParamAttr(name="%s_gru_mixed.b" % prefix),
+        gru_param_attr=ParamAttr(name='%s_gru.w' % prefix),
+        gru_bias_attr=ParamAttr(name="%s_gru.b" % prefix))
     sent_vec = paddle.layer.last_seq(gru)
     return sent_vec
 ```
@@ -184,7 +190,11 @@ def create_fc(self, emb, prefix=''):
     '''
     _input_layer = paddle.layer.pooling(
         input=emb, pooling_type=paddle.pooling.Max())
-    fc = paddle.layer.fc(input=_input_layer, size=self.dnn_dims[1])
+    fc = paddle.layer.fc(
+        input=_input_layer,
+        size=self.dnn_dims[1],
+        param_attr=ParamAttr(name='%s_fc.w' % prefix),
+        bias_attr=ParamAttr(name="%s_fc.b" % prefix))
     return fc
 ```
@@ -206,7 +216,6 @@ def create_dnn(self, sent_vec, prefix):
         fc = paddle.layer.fc(
             input=_input_layer,
             size=dim,
-            name=name,
             act=paddle.activation.Tanh(),
             param_attr=ParamAttr(name='%s.w' % name),
             bias_attr=ParamAttr(name='%s.b' % name),
@@ -244,9 +253,9 @@ def _build_classification_or_regression_model(self, is_classification):
         if is_classification else paddle.data_type.dense_input)
 
     prefixs = '_ _'.split(
-    ) if self.share_semantic_generator else 'left right'.split()
+    ) if self.share_semantic_generator else 'source target'.split()
     embed_prefixs = '_ _'.split(
-    ) if self.share_embed else 'left right'.split()
+    ) if self.share_embed else 'source target'.split()
 
     word_vecs = []
     for id, input in enumerate([source, target]):
@@ -258,16 +267,21 @@ def _build_classification_or_regression_model(self, is_classification):
         x = self.model_arch_creater(input, prefix=prefixs[id])
         semantics.append(x)
 
-    concated_vector = paddle.layer.concat(semantics)
-    prediction = paddle.layer.fc(
-        input=concated_vector,
-        size=self.class_num,
-        act=paddle.activation.Softmax())
-    cost = paddle.layer.classification_cost(
-        input=prediction,
-        label=label) if is_classification else paddle.layer.mse_cost(
-            prediction, label)
-    return cost, prediction, label
+    if is_classification:
+        concated_vector = paddle.layer.concat(semantics)
+        prediction = paddle.layer.fc(
+            input=concated_vector,
+            size=self.class_num,
+            act=paddle.activation.Softmax())
+        cost = paddle.layer.classification_cost(
+            input=prediction, label=label)
+    else:
+        prediction = paddle.layer.cos_sim(*semantics)
+        cost = paddle.layer.square_error_cost(prediction, label)
+
+    if not self.is_infer:
+        return cost, prediction, label
+    return prediction
 ```
 ### Pairwise Rank实现
 Pairwise Rank复用上面的DNN结构,同一个source对两个target求相似度打分,
@@ -297,7 +311,7 @@ def _build_rank_model(self):
         name='label_input', type=paddle.data_type.integer_value(1))
 
     prefixs = '_ _ _'.split(
-    ) if self.share_semantic_generator else 'source left right'.split()
+    ) if self.share_semantic_generator else 'source target target'.split()
     embed_prefixs = '_ _'.split(
     ) if self.share_embed else 'source target target'.split()
@@ -406,7 +420,7 @@ optional arguments:
                         path of the target's word dic, if not set, the
                         `source_dic_path` will be used
   -b BATCH_SIZE, --batch_size BATCH_SIZE
-                        size of mini-batch (default:10)
+                        size of mini-batch (default:32)
   -p NUM_PASSES, --num_passes NUM_PASSES
                         number of passes to run(default:10)
   -y MODEL_TYPE, --model_type MODEL_TYPE
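Note on the `_build_classification_or_regression_model` hunk above: the regression branch no longer concatenates the two semantic vectors into a softmax FC; it scores the pair directly with `paddle.layer.cos_sim` and trains against the dense label with `paddle.layer.square_error_cost` (replacing `mse_cost`), and in infer mode only the prediction is returned. A minimal numpy sketch of what this new head computes; the helper names here are illustrative stand-ins, not Paddle APIs:

```python
import numpy as np

def cos_sim(a, b):
    # What paddle.layer.cos_sim computes for one pair of vectors.
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def square_error_cost(prediction, label):
    # What paddle.layer.square_error_cost computes per sample.
    return (prediction - label) ** 2

source_vec = np.array([0.2, 0.7, 0.1])  # semantic vector from the source tower
target_vec = np.array([0.3, 0.6, 0.2])  # semantic vector from the target tower
label = 1.0                             # dense relevance label from the data

prediction = cos_sim(source_vec, target_vec)  # returned alone when is_infer
cost = square_error_cost(prediction, label)   # minimized during training
print(prediction, cost)
```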
diff --git a/dssm/infer.py b/dssm/infer.py
index dc5595abceae44f985ab616025c8488d46456b8c..f0c65e44a8c5f9249172f0c1912dc9c195ce69c2 100644
--- a/dssm/infer.py
+++ b/dssm/infer.py
@@ -1,5 +1,6 @@
 import argparse
 import itertools
+import distutils.util
 
 import reader
 import paddle.v2 as paddle
@@ -56,12 +57,12 @@
     (ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE))
 parser.add_argument(
     '--share_network_between_source_target',
-    type=bool,
+    type=distutils.util.strtobool,
     default=False,
     help="whether to share network parameters between source and target")
 parser.add_argument(
     '--share_embed',
-    type=bool,
+    type=distutils.util.strtobool,
     default=False,
     help="whether to share word embedding between source and target")
 parser.add_argument(
diff --git a/dssm/network_conf.py b/dssm/network_conf.py
index 10c8974f24f98ce05eb93c85559152f5eb274d60..8e45ef81d9408a597aa44af8521429c26f1e2697 100644
--- a/dssm/network_conf.py
+++ b/dssm/network_conf.py
@@ -96,14 +96,24 @@ class DSSM(object):
         '''
         _input_layer = paddle.layer.pooling(
             input=emb, pooling_type=paddle.pooling.Max())
-        fc = paddle.layer.fc(input=_input_layer, size=self.dnn_dims[1])
+        fc = paddle.layer.fc(
+            input=_input_layer,
+            size=self.dnn_dims[1],
+            param_attr=ParamAttr(name='%s_fc.w' % prefix),
+            bias_attr=ParamAttr(name="%s_fc.b" % prefix))
         return fc
 
     def create_rnn(self, emb, prefix=''):
         '''
         A GRU sentence vector learner.
         '''
-        gru = paddle.networks.simple_gru(input=emb, size=256)
+        gru = paddle.networks.simple_gru(
+            input=emb,
+            size=self.dnn_dims[1],
+            mixed_param_attr=ParamAttr(name='%s_gru_mixed.w' % prefix),
+            mixed_bias_param_attr=ParamAttr(name="%s_gru_mixed.b" % prefix),
+            gru_param_attr=ParamAttr(name='%s_gru.w' % prefix),
+            gru_bias_attr=ParamAttr(name="%s_gru.b" % prefix))
         sent_vec = paddle.layer.last_seq(gru)
         return sent_vec
 
@@ -147,7 +157,6 @@ class DSSM(object):
             logger.info("create fc layer [%s] which dimention is %d" %
                         (name, dim))
             fc = paddle.layer.fc(
-                name=name,
                 input=_input_layer,
                 size=dim,
                 act=paddle.activation.Tanh(),
@@ -195,7 +204,7 @@ class DSSM(object):
             name='label_input', type=paddle.data_type.integer_value(1))
 
         prefixs = '_ _ _'.split(
-        ) if self.share_semantic_generator else 'source left right'.split()
+        ) if self.share_semantic_generator else 'source target target'.split()
         embed_prefixs = '_ _'.split(
         ) if self.share_embed else 'source target target'.split()
@@ -249,9 +258,9 @@ class DSSM(object):
             if is_classification else paddle.data_type.dense_vector(1))
 
         prefixs = '_ _'.split(
-        ) if self.share_semantic_generator else 'left right'.split()
+        ) if self.share_semantic_generator else 'source target'.split()
         embed_prefixs = '_ _'.split(
-        ) if self.share_embed else 'left right'.split()
+        ) if self.share_embed else 'source target'.split()
 
         word_vecs = []
         for id, input in enumerate([source, target]):
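The `network_conf.py` hunks give every fc/gru parameter an explicit `ParamAttr` name built from the tower prefix. In PaddlePaddle, layers whose parameters carry the same name are bound to one underlying parameter, so when `share_semantic_generator` is set both towers use the `'_'` prefix and the names collide on purpose. A plain-Python sketch of the naming scheme (output comments illustrative):

```python
# Reproduce the prefix logic from _build_classification_or_regression_model
# and the fc naming convention the patch adds in create_fc().
def param_names(share_semantic_generator):
    prefixs = '_ _'.split(
    ) if share_semantic_generator else 'source target'.split()
    return [('%s_fc.w' % p, '%s_fc.b' % p) for p in prefixs]

print(param_names(True))   # [('_fc.w', '_fc.b'), ('_fc.w', '_fc.b')] -> one shared set
print(param_names(False))  # [('source_fc.w', ...), ('target_fc.w', ...)] -> two sets
```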
diff --git a/dssm/train.py b/dssm/train.py
index bc7685ab9632a7c34acdb2c0cacf7feec7699985..a76948774b8ffe5f80b8c6b6fdc139780b3b388e 100644
--- a/dssm/train.py
+++ b/dssm/train.py
@@ -1,4 +1,5 @@
 import argparse
+import distutils.util
 
 import paddle.v2 as paddle
 from network_conf import DSSM
@@ -35,8 +36,8 @@ parser.add_argument(
     '-b',
     '--batch_size',
     type=int,
-    default=10,
-    help="size of mini-batch (default:10)")
+    default=32,
+    help="size of mini-batch (default:32)")
 parser.add_argument(
     '-p',
     '--num_passes',
@@ -62,12 +63,12 @@ parser.add_argument(
     (ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE))
 parser.add_argument(
     '--share_network_between_source_target',
-    type=bool,
+    type=distutils.util.strtobool,
     default=False,
     help="whether to share network parameters between source and target")
 parser.add_argument(
     '--share_embed',
-    type=bool,
+    type=distutils.util.strtobool,
     default=False,
     help="whether to share word embedding between source and target")
 parser.add_argument(
@@ -80,7 +81,7 @@ parser.add_argument(
     '--num_workers', type=int, default=1, help="num worker threads, default 1")
 parser.add_argument(
     '--use_gpu',
-    type=bool,
+    type=distutils.util.strtobool,
     default=False,
     help="whether to use GPU devices (default: False)")
 parser.add_argument(
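Background on the repeated `type=bool` → `type=distutils.util.strtobool` change in `infer.py` and `train.py`: `argparse` applies `type` to the raw command-line string, and `bool()` of any non-empty string is `True`, so `--use_gpu False` used to enable the GPU anyway. `distutils.util.strtobool` parses `"true"/"false"/"1"/"0"/"yes"/"no"` and returns `1` or `0`. A self-contained demonstration (flag name taken from the patch):

```python
import argparse
import distutils.util

print(bool('False'))  # True -- the old bug: any non-empty string is truthy

parser = argparse.ArgumentParser()
parser.add_argument('--use_gpu', type=distutils.util.strtobool, default=False)
args = parser.parse_args(['--use_gpu', 'False'])
print(args.use_gpu)   # 0 -- falsy, as intended
```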