From c7008f5c364e8d271b6dc6123fc3e74d482351c9 Mon Sep 17 00:00:00 2001
From: ranqiu
Date: Wed, 1 Nov 2017 11:12:38 +0800
Subject: [PATCH] Refine code and doc of DSSM

---
 dssm/README.md       | 48 +++++++++++++++++++++++++++++++-----------------
 dssm/index.html      | 48 +++++++++++++++++++++++++++++++-----------------
 dssm/network_conf.py |  4 ++--
 dssm/train.py        |  4 ++--
 4 files changed, 66 insertions(+), 38 deletions(-)

diff --git a/dssm/README.md b/dssm/README.md
index ab9d4a1b..2d5e0eff 100644
--- a/dssm/README.md
+++ b/dssm/README.md
@@ -121,7 +121,13 @@ def create_rnn(self, emb, prefix=''):
     '''
     A GRU sentence vector learner.
     '''
-    gru = paddle.layer.gru_memory(input=emb,)
+    gru = paddle.networks.simple_gru(
+        input=emb,
+        size=self.dnn_dims[1],
+        mixed_param_attr=ParamAttr(name='%s_gru_mixed.w' % prefix),
+        mixed_bias_param_attr=ParamAttr(name="%s_gru_mixed.b" % prefix),
+        gru_param_attr=ParamAttr(name='%s_gru.w' % prefix),
+        gru_bias_attr=ParamAttr(name="%s_gru.b" % prefix))
     sent_vec = paddle.layer.last_seq(gru)
     return sent_vec
 ```
@@ -140,7 +146,11 @@ def create_fc(self, emb, prefix=''):
     '''
     _input_layer = paddle.layer.pooling(
         input=emb, pooling_type=paddle.pooling.Max())
-    fc = paddle.layer.fc(input=_input_layer, size=self.dnn_dims[1])
+    fc = paddle.layer.fc(
+        input=_input_layer,
+        size=self.dnn_dims[1],
+        param_attr=ParamAttr(name='%s_fc.w' % prefix),
+        bias_attr=ParamAttr(name="%s_fc.b" % prefix))
     return fc
 ```

@@ -160,7 +170,6 @@ def create_dnn(self, sent_vec, prefix):
         fc = paddle.layer.fc(
             input=_input_layer,
             size=dim,
-            name=name,
             act=paddle.activation.Tanh(),
             param_attr=ParamAttr(name='%s.w' % name),
             bias_attr=ParamAttr(name='%s.b' % name),
@@ -198,9 +207,9 @@ def _build_classification_or_regression_model(self, is_classification):
         if is_classification else paddle.data_type.dense_input)

     prefixs = '_ _'.split(
-    ) if self.share_semantic_generator else 'left right'.split()
+    ) if self.share_semantic_generator else 'source target'.split()
     embed_prefixs = '_ _'.split(
-    ) if self.share_embed else 'left right'.split()
+    ) if self.share_embed else 'source target'.split()

     word_vecs = []
     for id, input in enumerate([source, target]):
@@ -212,16 +221,21 @@ def _build_classification_or_regression_model(self, is_classification):
         x = self.model_arch_creater(input, prefix=prefixs[id])
         semantics.append(x)

-    concated_vector = paddle.layer.concat(semantics)
-    prediction = paddle.layer.fc(
-        input=concated_vector,
-        size=self.class_num,
-        act=paddle.activation.Softmax())
-    cost = paddle.layer.classification_cost(
-        input=prediction,
-        label=label) if is_classification else paddle.layer.mse_cost(
-            prediction, label)
-    return cost, prediction, label
+    if is_classification:
+        concated_vector = paddle.layer.concat(semantics)
+        prediction = paddle.layer.fc(
+            input=concated_vector,
+            size=self.class_num,
+            act=paddle.activation.Softmax())
+        cost = paddle.layer.classification_cost(
+            input=prediction, label=label)
+    else:
+        prediction = paddle.layer.cos_sim(*semantics)
+        cost = paddle.layer.square_error_cost(prediction, label)
+
+    if not self.is_infer:
+        return cost, prediction, label
+    return prediction
 ```

 ### Pairwise Rank
@@ -251,7 +265,7 @@ def _build_rank_model(self):
         name='label_input', type=paddle.data_type.integer_value(1))

     prefixs = '_ _ _'.split(
-    ) if self.share_semantic_generator else 'source left right'.split()
+    ) if self.share_semantic_generator else 'source target target'.split()
     embed_prefixs = '_ _'.split(
     ) if self.share_embed else 'source target target'.split()

@@ -361,7 +375,7 @@ optional arguments:
                         path of the target's word dic, if not set, the
                         `source_dic_path` will be used
   -b BATCH_SIZE, --batch_size BATCH_SIZE
-                        size of mini-batch (default:10)
+                        size of mini-batch (default:32)
   -p NUM_PASSES, --num_passes NUM_PASSES
                         number of passes to run(default:10)
   -y MODEL_TYPE, --model_type MODEL_TYPE
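The README hunks above carry the behavioral core of the patch: the broken `paddle.layer.gru_memory` call becomes `paddle.networks.simple_gru`, per-prefix parameter names let the source and target towers share or separate weights, and the regression branch now scores a pair with cosine similarity and a squared error cost instead of reusing the concat-plus-softmax head. The sketch below restates that head as a standalone function so the two branches are easier to compare; it is a minimal illustration assuming the v2-era API this repo uses, and `build_head`, `source_vec`, and `target_vec` are hypothetical stand-ins for the vectors produced by `model_arch_creater`, not code from the patch. (The index.html diff that follows simply mirrors the README changes in the generated page.)

import paddle.v2 as paddle

def build_head(source_vec, target_vec, label, class_num, is_classification,
               is_infer=False):
    # Hypothetical helper, not part of the patch.
    if is_classification:
        # Classification branch (unchanged by the patch): concatenate both
        # semantic vectors and classify with a softmax FC layer.
        merged = paddle.layer.concat(input=[source_vec, target_vec])
        prediction = paddle.layer.fc(
            input=merged, size=class_num, act=paddle.activation.Softmax())
        cost = paddle.layer.classification_cost(input=prediction, label=label)
    else:
        # Regression branch (new in the patch): cosine similarity of the two
        # vectors, trained against a dense label with a squared error cost.
        prediction = paddle.layer.cos_sim(source_vec, target_vec)
        cost = paddle.layer.square_error_cost(input=prediction, label=label)
    # At inference time only the prediction is needed, matching the new
    # `is_infer` early return in the patched method.
    return prediction if is_infer else (cost, prediction, label)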
diff --git a/dssm/index.html b/dssm/index.html
index 2231c012..b4777a28 100644
--- a/dssm/index.html
+++ b/dssm/index.html
@@ -163,7 +163,13 @@ def create_rnn(self, emb, prefix=''):
     '''
     A GRU sentence vector learner.
     '''
-    gru = paddle.layer.gru_memory(input=emb,)
+    gru = paddle.networks.simple_gru(
+        input=emb,
+        size=self.dnn_dims[1],
+        mixed_param_attr=ParamAttr(name='%s_gru_mixed.w' % prefix),
+        mixed_bias_param_attr=ParamAttr(name="%s_gru_mixed.b" % prefix),
+        gru_param_attr=ParamAttr(name='%s_gru.w' % prefix),
+        gru_bias_attr=ParamAttr(name="%s_gru.b" % prefix))
     sent_vec = paddle.layer.last_seq(gru)
     return sent_vec
 ```
@@ -182,7 +188,11 @@ def create_fc(self, emb, prefix=''):
     '''
     _input_layer = paddle.layer.pooling(
         input=emb, pooling_type=paddle.pooling.Max())
-    fc = paddle.layer.fc(input=_input_layer, size=self.dnn_dims[1])
+    fc = paddle.layer.fc(
+        input=_input_layer,
+        size=self.dnn_dims[1],
+        param_attr=ParamAttr(name='%s_fc.w' % prefix),
+        bias_attr=ParamAttr(name="%s_fc.b" % prefix))
     return fc
 ```

@@ -202,7 +212,6 @@ def create_dnn(self, sent_vec, prefix):
         fc = paddle.layer.fc(
             input=_input_layer,
             size=dim,
-            name=name,
             act=paddle.activation.Tanh(),
             param_attr=ParamAttr(name='%s.w' % name),
             bias_attr=ParamAttr(name='%s.b' % name),
@@ -240,9 +249,9 @@ def _build_classification_or_regression_model(self, is_classification):
         if is_classification else paddle.data_type.dense_input)

     prefixs = '_ _'.split(
-    ) if self.share_semantic_generator else 'left right'.split()
+    ) if self.share_semantic_generator else 'source target'.split()
     embed_prefixs = '_ _'.split(
-    ) if self.share_embed else 'left right'.split()
+    ) if self.share_embed else 'source target'.split()

     word_vecs = []
     for id, input in enumerate([source, target]):
@@ -254,16 +263,21 @@ def _build_classification_or_regression_model(self, is_classification):
         x = self.model_arch_creater(input, prefix=prefixs[id])
         semantics.append(x)

-    concated_vector = paddle.layer.concat(semantics)
-    prediction = paddle.layer.fc(
-        input=concated_vector,
-        size=self.class_num,
-        act=paddle.activation.Softmax())
-    cost = paddle.layer.classification_cost(
-        input=prediction,
-        label=label) if is_classification else paddle.layer.mse_cost(
-            prediction, label)
-    return cost, prediction, label
+    if is_classification:
+        concated_vector = paddle.layer.concat(semantics)
+        prediction = paddle.layer.fc(
+            input=concated_vector,
+            size=self.class_num,
+            act=paddle.activation.Softmax())
+        cost = paddle.layer.classification_cost(
+            input=prediction, label=label)
+    else:
+        prediction = paddle.layer.cos_sim(*semantics)
+        cost = paddle.layer.square_error_cost(prediction, label)
+
+    if not self.is_infer:
+        return cost, prediction, label
+    return prediction
 ```

 ### Pairwise Rank
@@ -293,7 +307,7 @@ def _build_rank_model(self):
         name='label_input', type=paddle.data_type.integer_value(1))

     prefixs = '_ _ _'.split(
-    ) if self.share_semantic_generator else 'source left right'.split()
+    ) if self.share_semantic_generator else 'source target target'.split()
     embed_prefixs = '_ _'.split(
     ) if self.share_embed else 'source target target'.split()

@@ -403,7 +417,7 @@ optional arguments:
                         path of the target's word dic, if not set, the
                         `source_dic_path` will be used
   -b BATCH_SIZE, --batch_size BATCH_SIZE
-                        size of mini-batch (default:10)
+                        size of mini-batch (default:32)
   -p NUM_PASSES, --num_passes NUM_PASSES
                         number of passes to run(default:10)
   -y MODEL_TYPE, --model_type MODEL_TYPE
diff --git a/dssm/network_conf.py b/dssm/network_conf.py
index 8e45ef81..6888ca0e 100644
--- a/dssm/network_conf.py
+++ b/dssm/network_conf.py
@@ -100,7 +100,7 @@ class DSSM(object):
             input=_input_layer,
             size=self.dnn_dims[1],
             param_attr=ParamAttr(name='%s_fc.w' % prefix),
-            bias_attr=ParamAttr(name="%s_fc.b" % prefix))
+            bias_attr=ParamAttr(name="%s_fc.b" % prefix, initial_std=0.))
         return fc

     def create_rnn(self, emb, prefix=''):
@@ -161,7 +161,7 @@ class DSSM(object):
                 size=dim,
                 act=paddle.activation.Tanh(),
                 param_attr=ParamAttr(name='%s.w' % name),
-                bias_attr=ParamAttr(name='%s.b' % name))
+                bias_attr=ParamAttr(name='%s.b' % name, initial_std=0.))
             _input_layer = fc

         return _input_layer
diff --git a/dssm/train.py b/dssm/train.py
index a7694877..eb563d1d 100644
--- a/dssm/train.py
+++ b/dssm/train.py
@@ -131,7 +131,7 @@ def train(train_data_path=None,
           target_dic_path=None,
           model_type=ModelType.create_classification(),
           model_arch=ModelArch.create_cnn(),
-          batch_size=10,
+          batch_size=32,
           num_passes=10,
           share_semantic_generator=False,
           share_embed=False,
@@ -187,7 +187,7 @@
     parameters = paddle.parameters.create(cost)

     adam_optimizer = paddle.optimizer.Adam(
-        learning_rate=1e-3,
+        learning_rate=2e-4,
         regularization=paddle.optimizer.L2Regularization(rate=1e-3),
         model_average=paddle.optimizer.ModelAverage(average_window=0.5))

--
GitLab
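A note on the train.py hunks: the Adam learning rate drops from 1e-3 to 2e-4 while the default mini-batch grows from 10 to 32, a common pairing of a smaller step size with larger batches; L2 regularization and model averaging are unchanged. The snippet below shows how the retuned optimizer plugs into a v2-style trainer. It is a sketch, assuming `cost` and `parameters` are built as in train.py, not an excerpt from the patch.

import paddle.v2 as paddle

# Retuned settings exactly as in the train.py hunk above.
adam_optimizer = paddle.optimizer.Adam(
    learning_rate=2e-4,
    regularization=paddle.optimizer.L2Regularization(rate=1e-3),
    model_average=paddle.optimizer.ModelAverage(average_window=0.5))

# Hypothetical wiring, assuming `cost` comes from the DSSM network as in
# train.py:
# parameters = paddle.parameters.create(cost)
# trainer = paddle.trainer.SGD(
#     cost=cost, parameters=parameters, update_equation=adam_optimizer)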