From c7008f5c364e8d271b6dc6123fc3e74d482351c9 Mon Sep 17 00:00:00 2001
From: ranqiu
Date: Wed, 1 Nov 2017 11:12:38 +0800
Subject: [PATCH] Refine code and doc of DSSM

---
 dssm/README.md       | 48 +++++++++++++++++++++++++++++++-----------------
 dssm/index.html      | 48 +++++++++++++++++++++++++++++++-----------------
 dssm/network_conf.py |  4 ++--
 dssm/train.py        |  4 ++--
 4 files changed, 66 insertions(+), 38 deletions(-)

diff --git a/dssm/README.md b/dssm/README.md
index ab9d4a1b..2d5e0eff 100644
--- a/dssm/README.md
+++ b/dssm/README.md
@@ -121,7 +121,13 @@ def create_rnn(self, emb, prefix=''):
     '''
     A GRU sentence vector learner.
     '''
-    gru = paddle.layer.gru_memory(input=emb,)
+    gru = paddle.networks.simple_gru(
+        input=emb,
+        size=self.dnn_dims[1],
+        mixed_param_attr=ParamAttr(name='%s_gru_mixed.w' % prefix),
+        mixed_bias_param_attr=ParamAttr(name="%s_gru_mixed.b" % prefix),
+        gru_param_attr=ParamAttr(name='%s_gru.w' % prefix),
+        gru_bias_attr=ParamAttr(name="%s_gru.b" % prefix))
     sent_vec = paddle.layer.last_seq(gru)
     return sent_vec
 ```
@@ -140,7 +146,11 @@ def create_fc(self, emb, prefix=''):
     '''
     _input_layer = paddle.layer.pooling(
         input=emb, pooling_type=paddle.pooling.Max())
-    fc = paddle.layer.fc(input=_input_layer, size=self.dnn_dims[1])
+    fc = paddle.layer.fc(
+        input=_input_layer,
+        size=self.dnn_dims[1],
+        param_attr=ParamAttr(name='%s_fc.w' % prefix),
+        bias_attr=ParamAttr(name="%s_fc.b" % prefix))
     return fc
 ```

@@ -160,7 +170,6 @@ def create_dnn(self, sent_vec, prefix):
         fc = paddle.layer.fc(
             input=_input_layer,
             size=dim,
-            name=name,
             act=paddle.activation.Tanh(),
             param_attr=ParamAttr(name='%s.w' % name),
             bias_attr=ParamAttr(name='%s.b' % name),
@@ -198,9 +207,9 @@ def _build_classification_or_regression_model(self, is_classification):
         if is_classification else paddle.data_type.dense_input)

     prefixs = '_ _'.split(
-    ) if self.share_semantic_generator else 'left right'.split()
+    ) if self.share_semantic_generator else 'source target'.split()
     embed_prefixs = '_ _'.split(
-    ) if self.share_embed else 'left right'.split()
+    ) if self.share_embed else 'source target'.split()

     word_vecs = []
     for id, input in enumerate([source, target]):
@@ -212,16 +221,21 @@ def _build_classification_or_regression_model(self, is_classification):
         x = self.model_arch_creater(input, prefix=prefixs[id])
         semantics.append(x)

-    concated_vector = paddle.layer.concat(semantics)
-    prediction = paddle.layer.fc(
-        input=concated_vector,
-        size=self.class_num,
-        act=paddle.activation.Softmax())
-    cost = paddle.layer.classification_cost(
-        input=prediction,
-        label=label) if is_classification else paddle.layer.mse_cost(
-            prediction, label)
-    return cost, prediction, label
+    if is_classification:
+        concated_vector = paddle.layer.concat(semantics)
+        prediction = paddle.layer.fc(
+            input=concated_vector,
+            size=self.class_num,
+            act=paddle.activation.Softmax())
+        cost = paddle.layer.classification_cost(
+            input=prediction, label=label)
+    else:
+        prediction = paddle.layer.cos_sim(*semantics)
+        cost = paddle.layer.square_error_cost(prediction, label)
+
+    if not self.is_infer:
+        return cost, prediction, label
+    return prediction
 ```

 ### Pairwise Rank
@@ -251,7 +265,7 @@ def _build_rank_model(self):
         name='label_input', type=paddle.data_type.integer_value(1))

     prefixs = '_ _ _'.split(
-    ) if self.share_semantic_generator else 'source left right'.split()
+    ) if self.share_semantic_generator else 'source target target'.split()
     embed_prefixs = '_ _'.split(
     ) if self.share_embed else 'source target target'.split()

@@ -361,7 +375,7 @@ optional arguments:
                         path of the target's word dic, if not set, the
                         `source_dic_path` will be used
   -b BATCH_SIZE, --batch_size BATCH_SIZE
-                        size of mini-batch (default:10)
+                        size of mini-batch (default:32)
   -p NUM_PASSES, --num_passes NUM_PASSES
                         number of passes to run(default:10)
   -y MODEL_TYPE, --model_type MODEL_TYPE
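The README hunks above carry the behavioral core of the patch: the broken `paddle.layer.gru_memory` call becomes `paddle.networks.simple_gru`, per-prefix parameter names let the source and target towers share or separate weights, and the regression branch now scores a pair with cosine similarity and a squared error cost instead of reusing the concat-plus-softmax head. The sketch below restates that head as a standalone function so the two branches are easier to compare; it is a minimal illustration assuming the v2-era API this repo uses, and `build_head`, `source_vec`, and `target_vec` are hypothetical stand-ins for the vectors produced by `model_arch_creater`, not code from the patch. (The index.html diff that follows simply mirrors the README changes in the generated page.)

import paddle.v2 as paddle

def build_head(source_vec, target_vec, label, class_num, is_classification,
               is_infer=False):
    # Hypothetical helper, not part of the patch.
    if is_classification:
        # Classification branch (unchanged by the patch): concatenate both
        # semantic vectors and classify with a softmax FC layer.
        merged = paddle.layer.concat(input=[source_vec, target_vec])
        prediction = paddle.layer.fc(
            input=merged, size=class_num, act=paddle.activation.Softmax())
        cost = paddle.layer.classification_cost(input=prediction, label=label)
    else:
        # Regression branch (new in the patch): cosine similarity of the two
        # vectors, trained against a dense label with a squared error cost.
        prediction = paddle.layer.cos_sim(source_vec, target_vec)
        cost = paddle.layer.square_error_cost(input=prediction, label=label)
    # At inference time only the prediction is needed, matching the new
    # `is_infer` early return in the patched method.
    return prediction if is_infer else (cost, prediction, label)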
diff --git a/dssm/index.html b/dssm/index.html
index 2231c012..b4777a28 100644
--- a/dssm/index.html
+++ b/dssm/index.html
@@ -163,7 +163,13 @@ def create_rnn(self, emb, prefix=''):
     '''
     A GRU sentence vector learner.
     '''
-    gru = paddle.layer.gru_memory(input=emb,)
+    gru = paddle.networks.simple_gru(
+        input=emb,
+        size=self.dnn_dims[1],
+        mixed_param_attr=ParamAttr(name='%s_gru_mixed.w' % prefix),
+        mixed_bias_param_attr=ParamAttr(name="%s_gru_mixed.b" % prefix),
+        gru_param_attr=ParamAttr(name='%s_gru.w' % prefix),
+        gru_bias_attr=ParamAttr(name="%s_gru.b" % prefix))
     sent_vec = paddle.layer.last_seq(gru)
     return sent_vec
 ```
@@ -182,7 +188,11 @@ def create_fc(self, emb, prefix=''):
     '''
     _input_layer = paddle.layer.pooling(
         input=emb, pooling_type=paddle.pooling.Max())
-    fc = paddle.layer.fc(input=_input_layer, size=self.dnn_dims[1])
+    fc = paddle.layer.fc(
+        input=_input_layer,
+        size=self.dnn_dims[1],
+        param_attr=ParamAttr(name='%s_fc.w' % prefix),
+        bias_attr=ParamAttr(name="%s_fc.b" % prefix))
     return fc
 ```

@@ -202,7 +212,6 @@ def create_dnn(self, sent_vec, prefix):
         fc = paddle.layer.fc(
             input=_input_layer,
             size=dim,
-            name=name,
             act=paddle.activation.Tanh(),
             param_attr=ParamAttr(name='%s.w' % name),
             bias_attr=ParamAttr(name='%s.b' % name),
@@ -240,9 +249,9 @@ def _build_classification_or_regression_model(self, is_classification):
         if is_classification else paddle.data_type.dense_input)

     prefixs = '_ _'.split(
-    ) if self.share_semantic_generator else 'left right'.split()
+    ) if self.share_semantic_generator else 'source target'.split()
     embed_prefixs = '_ _'.split(
-    ) if self.share_embed else 'left right'.split()
+    ) if self.share_embed else 'source target'.split()

     word_vecs = []
     for id, input in enumerate([source, target]):
@@ -254,16 +263,21 @@ def _build_classification_or_regression_model(self, is_classification):
         x = self.model_arch_creater(input, prefix=prefixs[id])
         semantics.append(x)

-    concated_vector = paddle.layer.concat(semantics)
-    prediction = paddle.layer.fc(
-        input=concated_vector,
-        size=self.class_num,
-        act=paddle.activation.Softmax())
-    cost = paddle.layer.classification_cost(
-        input=prediction,
-        label=label) if is_classification else paddle.layer.mse_cost(
-            prediction, label)
-    return cost, prediction, label
+    if is_classification:
+        concated_vector = paddle.layer.concat(semantics)
+        prediction = paddle.layer.fc(
+            input=concated_vector,
+            size=self.class_num,
+            act=paddle.activation.Softmax())
+        cost = paddle.layer.classification_cost(
+            input=prediction, label=label)
+    else:
+        prediction = paddle.layer.cos_sim(*semantics)
+        cost = paddle.layer.square_error_cost(prediction, label)
+
+    if not self.is_infer:
+        return cost, prediction, label
+    return prediction
 ```

 ### Pairwise Rank
@@ -293,7 +307,7 @@ def _build_rank_model(self):
         name='label_input', type=paddle.data_type.integer_value(1))

     prefixs = '_ _ _'.split(
-    ) if self.share_semantic_generator else 'source left right'.split()
+    ) if self.share_semantic_generator else 'source target target'.split()
     embed_prefixs = '_ _'.split(
     ) if self.share_embed else 'source target target'.split()

@@ -403,7 +417,7 @@ optional arguments:
                         path of the target's word dic, if not set, the
                         `source_dic_path` will be used
   -b BATCH_SIZE, --batch_size BATCH_SIZE
-                        size of mini-batch (default:10)
+                        size of mini-batch (default:32)
   -p NUM_PASSES, --num_passes NUM_PASSES
                         number of passes to run(default:10)
   -y MODEL_TYPE, --model_type MODEL_TYPE
diff --git a/dssm/network_conf.py b/dssm/network_conf.py
index 8e45ef81..6888ca0e 100644
--- a/dssm/network_conf.py
+++ b/dssm/network_conf.py
@@ -100,7 +100,7 @@ class DSSM(object):
             input=_input_layer,
             size=self.dnn_dims[1],
             param_attr=ParamAttr(name='%s_fc.w' % prefix),
-            bias_attr=ParamAttr(name="%s_fc.b" % prefix))
+            bias_attr=ParamAttr(name="%s_fc.b" % prefix, initial_std=0.))
         return fc

     def create_rnn(self, emb, prefix=''):
@@ -161,7 +161,7 @@ class DSSM(object):
                 size=dim,
                 act=paddle.activation.Tanh(),
                 param_attr=ParamAttr(name='%s.w' % name),
-                bias_attr=ParamAttr(name='%s.b' % name))
+                bias_attr=ParamAttr(name='%s.b' % name, initial_std=0.))
             _input_layer = fc

         return _input_layer
diff --git a/dssm/train.py b/dssm/train.py
index a7694877..eb563d1d 100644
--- a/dssm/train.py
+++ b/dssm/train.py
@@ -131,7 +131,7 @@ def train(train_data_path=None,
           target_dic_path=None,
           model_type=ModelType.create_classification(),
           model_arch=ModelArch.create_cnn(),
-          batch_size=10,
+          batch_size=32,
           num_passes=10,
           share_semantic_generator=False,
           share_embed=False,
@@ -187,7 +187,7 @@
     parameters = paddle.parameters.create(cost)

     adam_optimizer = paddle.optimizer.Adam(
-        learning_rate=1e-3,
+        learning_rate=2e-4,
         regularization=paddle.optimizer.L2Regularization(rate=1e-3),
         model_average=paddle.optimizer.ModelAverage(average_window=0.5))

--
GitLab
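A note on the train.py hunks: the Adam learning rate drops from 1e-3 to 2e-4 while the default mini-batch grows from 10 to 32, a common pairing of a smaller step size with larger batches; L2 regularization and model averaging are unchanged. The snippet below shows how the retuned optimizer plugs into a v2-style trainer. It is a sketch, assuming `cost` and `parameters` are built as in train.py, not an excerpt from the patch.

import paddle.v2 as paddle

# Retuned settings exactly as in the train.py hunk above.
adam_optimizer = paddle.optimizer.Adam(
    learning_rate=2e-4,
    regularization=paddle.optimizer.L2Regularization(rate=1e-3),
    model_average=paddle.optimizer.ModelAverage(average_window=0.5))

# Hypothetical wiring, assuming `cost` comes from the DSSM network as in
# train.py:
# parameters = paddle.parameters.create(cost)
# trainer = paddle.trainer.SGD(
#     cost=cost, parameters=parameters, update_equation=adam_optimizer)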