From f3b1bb5ae98f921563ccc0218881e27dfb0bc6a9 Mon Sep 17 00:00:00 2001
From: caoying03
Date: Thu, 9 Nov 2017 16:26:52 +0800
Subject: [PATCH] refine DSSM comments.

---
 dssm/infer.py        |  65 ++++++++--------
 dssm/network_conf.py | 182 ++++++++++++++++++++++---------------------
 2 files changed, 125 insertions(+), 122 deletions(-)

diff --git a/dssm/infer.py b/dssm/infer.py
index f0c65e44..393c2d93 100644
--- a/dssm/infer.py
+++ b/dssm/infer.py
@@ -9,25 +9,25 @@ from utils import logger, ModelType, ModelArch, load_dic
 
 parser = argparse.ArgumentParser(description="PaddlePaddle DSSM infer")
 parser.add_argument(
-    '--model_path',
+    "--model_path",
     type=str,
     required=True,
     help="path of model parameters file")
 parser.add_argument(
-    '-i',
-    '--data_path',
+    "-i",
+    "--data_path",
     type=str,
     required=True,
     help="path of the dataset to infer")
 parser.add_argument(
-    '-o',
-    '--prediction_output_path',
+    "-o",
+    "--prediction_output_path",
     type=str,
     required=True,
     help="path to output the prediction")
 parser.add_argument(
-    '-y',
-    '--model_type',
+    "-y",
+    "--model_type",
     type=int,
     required=True,
     default=ModelType.CLASSIFICATION_MODE,
@@ -36,45 +36,45 @@ parser.add_argument(
     (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
      ModelType.REGRESSION_MODE))
 parser.add_argument(
-    '-s',
-    '--source_dic_path',
+    "-s",
+    "--source_dic_path",
     type=str,
     required=False,
     help="path of the source's word dic")
 parser.add_argument(
-    '--target_dic_path',
+    "--target_dic_path",
     type=str,
     required=False,
     help=("path of the target's word dictionary, "
           "if not set, the `source_dic_path` will be used"))
 parser.add_argument(
-    '-a',
-    '--model_arch',
+    "-a",
+    "--model_arch",
     type=int,
     required=True,
     default=ModelArch.CNN_MODE,
     help="model architecture, %d for CNN, %d for FC, %d for RNN" %
     (ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE))
 parser.add_argument(
-    '--share_network_between_source_target',
+    "--share_network_between_source_target",
     type=distutils.util.strtobool,
     default=False,
     help="whether to share network parameters between source and target")
 parser.add_argument(
-    '--share_embed',
+    "--share_embed",
    type=distutils.util.strtobool,
     default=False,
     help="whether to share word embedding between source and target")
 parser.add_argument(
-    '--dnn_dims',
+    "--dnn_dims",
     type=str,
-    default='256,128,64,32',
-    help=("dimentions of dnn layers, default is '256,128,64,32', "
+    default="256,128,64,32",
+    help=("dimensions of dnn layers, default is `256,128,64,32`, "
          "which means create a 4-layer dnn, "
          "demention of each layer is 256, 128, 64 and 32"))
 parser.add_argument(
-    '-c',
-    '--class_num',
+    "-c",
+    "--class_num",
     type=int,
     default=0,
     help="number of categories for classification task.")
@@ -83,9 +83,10 @@ args = parser.parse_args()
 args.model_type = ModelType(args.model_type)
 args.model_arch = ModelArch(args.model_arch)
 if args.model_type.is_classification():
-    assert args.class_num > 1, "--class_num should be set in classification task."
+    assert args.class_num > 1, ("The parameter class_num should be set "
+                                "in the classification task.")
 
-layer_dims = map(int, args.dnn_dims.split(','))
+layer_dims = map(int, args.dnn_dims.split(","))
 args.target_dic_path = args.source_dic_path if not args.target_dic_path \
     else args.target_dic_path
 
@@ -94,8 +95,6 @@ paddle.init(use_gpu=False, trainer_count=1)
 
 class Inferer(object):
     def __init__(self, param_path):
-        logger.info("create DSSM model")
-
         prediction = DSSM(
             dnn_dims=layer_dims,
             vocab_sizes=[
@@ -110,14 +109,13 @@ class Inferer(object):
             is_infer=True)()
 
         # load parameter
-        logger.info("load model parameters from %s" % param_path)
+        logger.info("Load the trained model from %s." % param_path)
         self.parameters = paddle.parameters.Parameters.from_tar(
-            open(param_path, 'r'))
+            open(param_path, "r"))
         self.inferer = paddle.inference.Inference(
             output_layer=prediction, parameters=self.parameters)
 
     def infer(self, data_path):
-        logger.info("infer data...")
         dataset = reader.Dataset(
             train_path=data_path,
             test_path=None,
@@ -125,19 +123,20 @@ class Inferer(object):
             target_dic_path=args.target_dic_path,
             model_type=args.model_type, )
         infer_reader = paddle.batch(dataset.infer, batch_size=1000)
-        logger.warning('write predictions to %s' % args.prediction_output_path)
+        logger.warning("Write predictions to %s." % args.prediction_output_path)
 
-        output_f = open(args.prediction_output_path, 'w')
+        output_f = open(args.prediction_output_path, "w")
         for id, batch in enumerate(infer_reader()):
             res = self.inferer.infer(input=batch)
-            predictions = [' '.join(map(str, x)) for x in res]
+            predictions = [" ".join(map(str, x)) for x in res]
             assert len(batch) == len(predictions), (
-                "predict error, %d inputs, "
-                "but %d predictions") % (len(batch), len(predictions))
-            output_f.write('\n'.join(map(str, predictions)) + '\n')
+                "Error! %d inputs are given, "
+                "but only %d predictions are returned.") % (len(batch),
+                                                            len(predictions))
+            output_f.write("\n".join(map(str, predictions)) + "\n")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     inferer = Inferer(args.model_path)
     inferer.infer(args.data_path)
diff --git a/dssm/network_conf.py b/dssm/network_conf.py
index 6888ca0e..135a00bf 100644
--- a/dssm/network_conf.py
+++ b/dssm/network_conf.py
@@ -13,26 +13,33 @@ class DSSM(object):
                  class_num=None,
                  share_embed=False,
                  is_infer=False):
-        '''
-        @dnn_dims: list of int
-            dimentions of each layer in semantic vector generator.
-        @vocab_sizes: 2-d tuple
-            size of both left and right items.
-        @model_type: int
-            type of task, should be 'rank: 0', 'regression: 1' or 'classification: 2'
-        @model_arch: int
-            model architecture
-        @share_semantic_generator: bool
-            whether to share the semantic vector generator for both left and right.
-        @share_embed: bool
-            whether to share the embeddings between left and right.
-        @class_num: int
-            number of categories.
-        '''
+        """
+        :param dnn_dims: The dimension of each layer in the semantic vector
+                         generator.
+        :type dnn_dims: list of int
+        :param vocab_sizes: The size of left and right items.
+        :type vocab_sizes: A list having 2 elements.
+        :param model_type: The type of task to train the DSSM model. The value
+                           should be "rank: 0", "regression: 1" or
+                           "classification: 2".
+        :type model_type: int
+        :param model_arch: A value indicating the model architecture to use.
+        :type model_arch: int
+        :param share_semantic_generator: A flag indicating whether to share the
+                                         semantic vector between the left and
+                                         the right item.
+        :type share_semantic_generator: bool
+        :param share_embed: A flag indicating whether to share the embeddings
+                            between the left and the right item.
+        :type share_embed: bool
+        :param class_num: The number of categories.
+        :type class_num: int
+        """
         assert len(vocab_sizes) == 2, (
-            "vocab_sizes specify the sizes left and right inputs, "
-            "and dim should be 2.")
-        assert len(dnn_dims) > 1, "more than two layers is needed."
+            "The parameter vocab_sizes specifies the sizes of the left and "
+            "right inputs. It should contain exactly two elements.")
+        assert len(dnn_dims) > 1, ("In the DNN model, more than one layer "
+                                   "is needed.")
 
         self.dnn_dims = dnn_dims
         self.vocab_sizes = vocab_sizes
@@ -42,91 +49,89 @@ class DSSM(object):
         self.model_arch = ModelArch(model_arch)
         self.class_num = class_num
         self.is_infer = is_infer
-        logger.warning("build DSSM model with config of %s, %s" %
+        logger.warning("Build the DSSM model with the config of %s, %s." %
                        (self.model_type, self.model_arch))
-        logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))
+        logger.info("The vocabulary sizes are: %s." % str(self.vocab_sizes))
 
         # bind model architecture
         _model_arch = {
-            'cnn': self.create_cnn,
-            'fc': self.create_fc,
-            'rnn': self.create_rnn,
+            "cnn": self.create_cnn,
+            "fc": self.create_fc,
+            "rnn": self.create_rnn,
         }
 
-        def _model_arch_creater(emb, prefix=''):
+        def _model_arch_creater(emb, prefix=""):
             sent_vec = _model_arch.get(str(model_arch))(emb, prefix)
             dnn = self.create_dnn(sent_vec, prefix)
             return dnn
 
         self.model_arch_creater = _model_arch_creater
 
-        # build model type
         _model_type = {
-            'classification': self._build_classification_model,
-            'rank': self._build_rank_model,
-            'regression': self._build_regression_model,
+            "classification": self._build_classification_model,
+            "rank": self._build_rank_model,
+            "regression": self._build_regression_model,
         }
-        print 'model type: ', str(self.model_type)
+        print("model type: %s" % str(self.model_type))
         self.model_type_creater = _model_type[str(self.model_type)]
 
     def __call__(self):
         return self.model_type_creater()
 
-    def create_embedding(self, input, prefix=''):
-        '''
-        Create an embedding table whose name has a `prefix`.
-        '''
-        logger.info("create embedding table [%s] which dimention is %d" %
+    def create_embedding(self, input, prefix=""):
+        """
+        Create word embedding. The `prefix` is added in front of the name of
+        the embedding's learnable parameter.
+        """
+        logger.info("Create an embedding table [%s] whose dimension is %d." %
                     (prefix, self.dnn_dims[0]))
         emb = paddle.layer.embedding(
             input=input,
             size=self.dnn_dims[0],
-            param_attr=ParamAttr(name='%s_emb.w' % prefix))
+            param_attr=ParamAttr(name="%s_emb.w" % prefix))
         return emb
 
-    def create_fc(self, emb, prefix=''):
-        '''
+    def create_fc(self, emb, prefix=""):
+        """
         A multi-layer fully connected neural networks.
-        @emb: paddle.layer
-            output of the embedding layer
-        @prefix: str
-            prefix of layers' names, used to share parameters between
-            more than one `fc` parts.
-        '''
+        :param emb: The output of the embedding layer.
+        :type emb: paddle.layer
+        :param prefix: A prefix that will be added to the layers' names.
+        :type prefix: str
+        """
 
         _input_layer = paddle.layer.pooling(
             input=emb, pooling_type=paddle.pooling.Max())
         fc = paddle.layer.fc(
             input=_input_layer,
             size=self.dnn_dims[1],
-            param_attr=ParamAttr(name='%s_fc.w' % prefix),
+            param_attr=ParamAttr(name="%s_fc.w" % prefix),
             bias_attr=ParamAttr(name="%s_fc.b" % prefix, initial_std=0.))
         return fc
 
-    def create_rnn(self, emb, prefix=''):
-        '''
+    def create_rnn(self, emb, prefix=""):
+        """
         A GRU sentence vector learner.
-        '''
+        """
         gru = paddle.networks.simple_gru(
             input=emb,
             size=self.dnn_dims[1],
-            mixed_param_attr=ParamAttr(name='%s_gru_mixed.w' % prefix),
+            mixed_param_attr=ParamAttr(name="%s_gru_mixed.w" % prefix),
             mixed_bias_param_attr=ParamAttr(name="%s_gru_mixed.b" % prefix),
-            gru_param_attr=ParamAttr(name='%s_gru.w' % prefix),
+            gru_param_attr=ParamAttr(name="%s_gru.w" % prefix),
             gru_bias_attr=ParamAttr(name="%s_gru.b" % prefix))
         sent_vec = paddle.layer.last_seq(gru)
         return sent_vec
 
-    def create_cnn(self, emb, prefix=''):
-        '''
+    def create_cnn(self, emb, prefix=""):
+        """
         A multi-layer CNN.
-        @emb: paddle.layer
-            output of the embedding layer
-        @prefix: str
-            prefix of layers' names, used to share parameters between
-            more than one `cnn` parts.
-        '''
+        :param emb: The word embedding.
+        :type emb: paddle.layer
+        :param prefix: The prefix that will be added to the layers' names.
+        :type prefix: str
+        """
 
         def create_conv(context_len, hidden_size, prefix):
             key = "%s_%d_%d" % (prefix, context_len, hidden_size)
@@ -135,15 +140,15 @@ class DSSM(object):
                 context_len=context_len,
                 hidden_size=hidden_size,
                 # set parameter attr for parameter sharing
-                context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'),
-                fc_param_attr=ParamAttr(name=key + '_fc.w'),
-                fc_bias_attr=ParamAttr(name=key + '_fc.b'),
-                pool_bias_attr=ParamAttr(name=key + '_pool.b'))
+                context_proj_param_attr=ParamAttr(name=key + "contex_proj.w"),
+                fc_param_attr=ParamAttr(name=key + "_fc.w"),
+                fc_bias_attr=ParamAttr(name=key + "_fc.b"),
+                pool_bias_attr=ParamAttr(name=key + "_pool.b"))
             return conv
 
-        logger.info('create a sequence_conv_pool which context width is 3')
+        logger.info("Create a sequence_conv_pool whose context width is 3.")
         conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
-        logger.info('create a sequence_conv_pool which context width is 4')
+        logger.info("Create a sequence_conv_pool whose context width is 4.")
         conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
         return conv_3, conv_4
 
@@ -160,8 +165,8 @@ class DSSM(object):
                 input=_input_layer,
                 size=dim,
                 act=paddle.activation.Tanh(),
-                param_attr=ParamAttr(name='%s.w' % name),
-                bias_attr=ParamAttr(name='%s.b' % name, initial_std=0.))
+                param_attr=ParamAttr(name="%s.w" % name),
+                bias_attr=ParamAttr(name="%s.b" % name, initial_std=0.))
             _input_layer = fc
         return _input_layer
 
@@ -178,7 +183,7 @@ class DSSM(object):
             is_classification=False)
 
     def _build_rank_model(self):
-        '''
+        """
         Build a pairwise rank model, and the cost is returned.
 
         A pairwise rank model has 3 inputs:
           - source sentence
           - left_target sentence
           - right_target sentence
           - label, 1 if left_target should be sorted in front of
             right_target, otherwise 0.
-        '''
+        """
         logger.info("build rank model")
         assert self.model_type.is_rank()
         source = paddle.layer.data(
-            name='source_input',
+            name="source_input",
             type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
         left_target = paddle.layer.data(
-            name='left_target_input',
+            name="left_target_input",
             type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
         right_target = paddle.layer.data(
-            name='right_target_input',
+            name="right_target_input",
             type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
         if not self.is_infer:
             label = paddle.layer.data(
-                name='label_input', type=paddle.data_type.integer_value(1))
+                name="label_input", type=paddle.data_type.integer_value(1))
 
-        prefixs = '_ _ _'.split(
-        ) if self.share_semantic_generator else 'source target target'.split()
-        embed_prefixs = '_ _'.split(
-        ) if self.share_embed else 'source target target'.split()
+        prefixs = "_ _ _".split(
+        ) if self.share_semantic_generator else "source target target".split()
+        embed_prefixs = "_ _ _".split(
+        ) if self.share_embed else "source target target".split()
 
         word_vecs = []
         for id, input in enumerate([source, left_target, right_target]):
@@ -218,9 +223,9 @@ class DSSM(object):
             x = self.model_arch_creater(input, prefix=prefixs[id])
             semantics.append(x)
 
-        # cossim score of source and left_target
+        # The cosine similarity score of source and left_target.
         left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
-        # cossim score of source and right target
+        # The cosine similarity score of source and right_target.
         right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
 
         if not self.is_infer:
@@ -233,34 +238,33 @@ class DSSM(object):
         return right_score
 
     def _build_classification_or_regression_model(self, is_classification):
-        '''
+        """
         Build a classification/regression model, and the cost is returned.
 
-        A Classification has 3 inputs:
+        The classification/regression task expects 3 inputs:
           - source sentence
           - target sentence
           - classification label
-        '''
+        """
         if is_classification:
-            # prepare inputs.
             assert self.class_num
 
         source = paddle.layer.data(
-            name='source_input',
+            name="source_input",
             type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
         target = paddle.layer.data(
-            name='target_input',
+            name="target_input",
             type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
         label = paddle.layer.data(
-            name='label_input',
+            name="label_input",
             type=paddle.data_type.integer_value(self.class_num)
             if is_classification else paddle.data_type.dense_vector(1))
 
-        prefixs = '_ _'.split(
-        ) if self.share_semantic_generator else 'source target'.split()
-        embed_prefixs = '_ _'.split(
-        ) if self.share_embed else 'source target'.split()
+        prefixs = "_ _".split(
+        ) if self.share_semantic_generator else "source target".split()
+        embed_prefixs = "_ _".split(
+        ) if self.share_embed else "source target".split()
 
         word_vecs = []
         for id, input in enumerate([source, target]):
-- 
GitLab
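
Note for reviewers, not part of the patch: below is a minimal usage sketch
of the constructor documented above, mirroring the call made in
dssm/infer.py. DSSM, ModelType, ModelArch, and load_dic all come from the
files touched here (PaddlePaddle v2 API, Python 2); the dictionary path
"./data/vocab.txt" is a hypothetical placeholder.

    # A minimal sketch: build a rank-mode DSSM with a CNN semantic
    # generator, using the keyword arguments documented in network_conf.py.
    import paddle.v2 as paddle

    from network_conf import DSSM
    from utils import ModelType, ModelArch, load_dic

    paddle.init(use_gpu=False, trainer_count=1)

    source_dic_path = "./data/vocab.txt"  # hypothetical dictionary path
    target_dic_path = source_dic_path     # reuse the source dictionary

    # dnn_dims[0] is the embedding size; the remaining entries are the
    # sizes of the fully connected layers in the semantic vector generator.
    prediction = DSSM(
        dnn_dims=[256, 128, 64, 32],
        vocab_sizes=[
            len(load_dic(path))
            for path in (source_dic_path, target_dic_path)
        ],
        model_type=ModelType.RANK_MODE,
        model_arch=ModelArch.CNN_MODE,
        share_semantic_generator=False,
        share_embed=False,
        is_infer=True)()

The trailing () invokes DSSM.__call__, which dispatches to the builder
selected by model_type; per _build_rank_model, in rank mode at inference
time this returns the cosine similarity score layer rather than a training
cost.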