From f3b1bb5ae98f921563ccc0218881e27dfb0bc6a9 Mon Sep 17 00:00:00 2001
From: caoying03
Date: Thu, 9 Nov 2017 16:26:52 +0800
Subject: [PATCH] refine DSSM comments.

---
 dssm/infer.py        |  65 ++++++++--------
 dssm/network_conf.py | 182 ++++++++++++++++++++++---------------------
 2 files changed, 125 insertions(+), 122 deletions(-)

diff --git a/dssm/infer.py b/dssm/infer.py
index f0c65e44..393c2d93 100644
--- a/dssm/infer.py
+++ b/dssm/infer.py
@@ -9,25 +9,25 @@ from utils import logger, ModelType, ModelArch, load_dic
 
 parser = argparse.ArgumentParser(description="PaddlePaddle DSSM infer")
 parser.add_argument(
-    '--model_path',
+    "--model_path",
     type=str,
     required=True,
     help="path of model parameters file")
 parser.add_argument(
-    '-i',
-    '--data_path',
+    "-i",
+    "--data_path",
     type=str,
     required=True,
     help="path of the dataset to infer")
 parser.add_argument(
-    '-o',
-    '--prediction_output_path',
+    "-o",
+    "--prediction_output_path",
     type=str,
     required=True,
     help="path to output the prediction")
 parser.add_argument(
-    '-y',
-    '--model_type',
+    "-y",
+    "--model_type",
     type=int,
     required=True,
     default=ModelType.CLASSIFICATION_MODE,
@@ -36,45 +36,45 @@ parser.add_argument(
     (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
      ModelType.REGRESSION_MODE))
 parser.add_argument(
-    '-s',
-    '--source_dic_path',
+    "-s",
+    "--source_dic_path",
     type=str,
     required=False,
     help="path of the source's word dic")
 parser.add_argument(
-    '--target_dic_path',
+    "--target_dic_path",
     type=str,
     required=False,
     help=("path of the target's word dictionary, "
           "if not set, the `source_dic_path` will be used"))
 parser.add_argument(
-    '-a',
-    '--model_arch',
+    "-a",
+    "--model_arch",
     type=int,
     required=True,
     default=ModelArch.CNN_MODE,
     help="model architecture, %d for CNN, %d for FC, %d for RNN" %
     (ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE))
 parser.add_argument(
-    '--share_network_between_source_target',
+    "--share_network_between_source_target",
     type=distutils.util.strtobool,
     default=False,
     help="whether to share network parameters between source and target")
 parser.add_argument(
-    '--share_embed',
+    "--share_embed",
    type=distutils.util.strtobool,
     default=False,
     help="whether to share word embedding between source and target")
 parser.add_argument(
-    '--dnn_dims',
+    "--dnn_dims",
     type=str,
-    default='256,128,64,32',
-    help=("dimentions of dnn layers, default is '256,128,64,32', "
+    default="256,128,64,32",
+    help=("dimensions of dnn layers, default is `256,128,64,32`, "
          "which means create a 4-layer dnn, "
          "demention of each layer is 256, 128, 64 and 32"))
 parser.add_argument(
-    '-c',
-    '--class_num',
+    "-c",
+    "--class_num",
     type=int,
     default=0,
     help="number of categories for classification task.")
@@ -83,9 +83,10 @@ args = parser.parse_args()
 args.model_type = ModelType(args.model_type)
 args.model_arch = ModelArch(args.model_arch)
 if args.model_type.is_classification():
-    assert args.class_num > 1, "--class_num should be set in classification task."
+    assert args.class_num > 1, ("The parameter class_num should be set "
+                                "in the classification task.")
 
-layer_dims = map(int, args.dnn_dims.split(','))
+layer_dims = map(int, args.dnn_dims.split(","))
 args.target_dic_path = args.source_dic_path if not args.target_dic_path \
     else args.target_dic_path
 
@@ -94,8 +95,6 @@ paddle.init(use_gpu=False, trainer_count=1)
 
 class Inferer(object):
     def __init__(self, param_path):
-        logger.info("create DSSM model")
-
         prediction = DSSM(
             dnn_dims=layer_dims,
             vocab_sizes=[
@@ -110,14 +109,13 @@ class Inferer(object):
             is_infer=True)()
 
         # load parameter
-        logger.info("load model parameters from %s" % param_path)
+        logger.info("Load the trained model from %s." % param_path)
         self.parameters = paddle.parameters.Parameters.from_tar(
-            open(param_path, 'r'))
+            open(param_path, "r"))
         self.inferer = paddle.inference.Inference(
             output_layer=prediction, parameters=self.parameters)
 
     def infer(self, data_path):
-        logger.info("infer data...")
         dataset = reader.Dataset(
             train_path=data_path,
             test_path=None,
@@ -125,19 +123,20 @@ class Inferer(object):
             target_dic_path=args.target_dic_path,
             model_type=args.model_type, )
         infer_reader = paddle.batch(dataset.infer, batch_size=1000)
-        logger.warning('write predictions to %s' % args.prediction_output_path)
+        logger.warning("Write predictions to %s." % args.prediction_output_path)
 
-        output_f = open(args.prediction_output_path, 'w')
+        output_f = open(args.prediction_output_path, "w")
         for id, batch in enumerate(infer_reader()):
             res = self.inferer.infer(input=batch)
-            predictions = [' '.join(map(str, x)) for x in res]
+            predictions = [" ".join(map(str, x)) for x in res]
             assert len(batch) == len(predictions), (
-                "predict error, %d inputs, "
-                "but %d predictions") % (len(batch), len(predictions))
-            output_f.write('\n'.join(map(str, predictions)) + '\n')
+                "Error! %d inputs are given, "
+                "but only %d predictions are returned.") % (len(batch),
+                                                            len(predictions))
+            output_f.write("\n".join(map(str, predictions)) + "\n")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     inferer = Inferer(args.model_path)
     inferer.infer(args.data_path)
diff --git a/dssm/network_conf.py b/dssm/network_conf.py
index 6888ca0e..135a00bf 100644
--- a/dssm/network_conf.py
+++ b/dssm/network_conf.py
@@ -13,26 +13,33 @@ class DSSM(object):
                  class_num=None,
                  share_embed=False,
                  is_infer=False):
-        '''
-        @dnn_dims: list of int
-            dimentions of each layer in semantic vector generator.
-        @vocab_sizes: 2-d tuple
-            size of both left and right items.
-        @model_type: int
-            type of task, should be 'rank: 0', 'regression: 1' or 'classification: 2'
-        @model_arch: int
-            model architecture
-        @share_semantic_generator: bool
-            whether to share the semantic vector generator for both left and right.
-        @share_embed: bool
-            whether to share the embeddings between left and right.
-        @class_num: int
-            number of categories.
-        '''
+        """
+        :param dnn_dims: The dimension of each layer in the semantic vector
+                         generator.
+        :type dnn_dims: list of int
+        :param vocab_sizes: The size of left and right items.
+        :type vocab_sizes: A list having 2 elements.
+        :param model_type: The type of task to train the DSSM model. The value
+                           should be "rank: 0", "regression: 1" or
+                           "classification: 2".
+        :type model_type: int
+        :param model_arch: A value indicating the model architecture to use.
+        :type model_arch: int
+        :param share_semantic_generator: A flag indicating whether to share the
+                                         semantic vector between the left and
+                                         the right item.
+        :type share_semantic_generator: bool
+        :param share_embed: A flag indicating whether to share the embeddings
+                            between the left and the right item.
+        :type share_embed: bool
+        :param class_num: The number of categories.
+        :type class_num: int
+        """
         assert len(vocab_sizes) == 2, (
-            "vocab_sizes specify the sizes left and right inputs, "
-            "and dim should be 2.")
-        assert len(dnn_dims) > 1, "more than two layers is needed."
+            "The parameter vocab_sizes specifies the sizes of the left and "
+            "right inputs. It should contain exactly two elements.")
+        assert len(dnn_dims) > 1, ("In the DNN model, more than one layer "
+                                   "is needed.")
 
         self.dnn_dims = dnn_dims
         self.vocab_sizes = vocab_sizes
@@ -42,91 +49,89 @@ class DSSM(object):
         self.model_arch = ModelArch(model_arch)
         self.class_num = class_num
         self.is_infer = is_infer
-        logger.warning("build DSSM model with config of %s, %s" %
+        logger.warning("Build the DSSM model with the config of %s, %s." %
                        (self.model_type, self.model_arch))
-        logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))
+        logger.info("The vocabulary sizes are: %s." % str(self.vocab_sizes))
 
         # bind model architecture
         _model_arch = {
-            'cnn': self.create_cnn,
-            'fc': self.create_fc,
-            'rnn': self.create_rnn,
+            "cnn": self.create_cnn,
+            "fc": self.create_fc,
+            "rnn": self.create_rnn,
         }
 
-        def _model_arch_creater(emb, prefix=''):
+        def _model_arch_creater(emb, prefix=""):
             sent_vec = _model_arch.get(str(model_arch))(emb, prefix)
             dnn = self.create_dnn(sent_vec, prefix)
             return dnn
 
         self.model_arch_creater = _model_arch_creater
 
-        # build model type
         _model_type = {
-            'classification': self._build_classification_model,
-            'rank': self._build_rank_model,
-            'regression': self._build_regression_model,
+            "classification": self._build_classification_model,
+            "rank": self._build_rank_model,
+            "regression": self._build_regression_model,
         }
-        print 'model type: ', str(self.model_type)
+        print("model type: %s" % str(self.model_type))
         self.model_type_creater = _model_type[str(self.model_type)]
 
     def __call__(self):
         return self.model_type_creater()
 
-    def create_embedding(self, input, prefix=''):
-        '''
-        Create an embedding table whose name has a `prefix`.
-        '''
-        logger.info("create embedding table [%s] which dimention is %d" %
+    def create_embedding(self, input, prefix=""):
+        """
+        Create word embedding. The `prefix` is added in front of the name of
+        the embedding's learnable parameter.
+        """
+        logger.info("Create an embedding table [%s] whose dimension is %d." %
                     (prefix, self.dnn_dims[0]))
         emb = paddle.layer.embedding(
             input=input,
             size=self.dnn_dims[0],
-            param_attr=ParamAttr(name='%s_emb.w' % prefix))
+            param_attr=ParamAttr(name="%s_emb.w" % prefix))
         return emb
 
-    def create_fc(self, emb, prefix=''):
-        '''
+    def create_fc(self, emb, prefix=""):
+        """
         A multi-layer fully connected neural networks.
-        @emb: paddle.layer
-            output of the embedding layer
-        @prefix: str
-            prefix of layers' names, used to share parameters between
-            more than one `fc` parts.
-        '''
+        :param emb: The output of the embedding layer.
+        :type emb: paddle.layer
+        :param prefix: A prefix that will be added to the layers' names.
+        :type prefix: str
+        """
 
         _input_layer = paddle.layer.pooling(
             input=emb, pooling_type=paddle.pooling.Max())
         fc = paddle.layer.fc(
             input=_input_layer,
             size=self.dnn_dims[1],
-            param_attr=ParamAttr(name='%s_fc.w' % prefix),
+            param_attr=ParamAttr(name="%s_fc.w" % prefix),
             bias_attr=ParamAttr(name="%s_fc.b" % prefix, initial_std=0.))
         return fc
 
-    def create_rnn(self, emb, prefix=''):
-        '''
+    def create_rnn(self, emb, prefix=""):
+        """
         A GRU sentence vector learner.
-        '''
+        """
         gru = paddle.networks.simple_gru(
             input=emb,
             size=self.dnn_dims[1],
-            mixed_param_attr=ParamAttr(name='%s_gru_mixed.w' % prefix),
+            mixed_param_attr=ParamAttr(name="%s_gru_mixed.w" % prefix),
             mixed_bias_param_attr=ParamAttr(name="%s_gru_mixed.b" % prefix),
-            gru_param_attr=ParamAttr(name='%s_gru.w' % prefix),
+            gru_param_attr=ParamAttr(name="%s_gru.w" % prefix),
             gru_bias_attr=ParamAttr(name="%s_gru.b" % prefix))
         sent_vec = paddle.layer.last_seq(gru)
         return sent_vec
 
-    def create_cnn(self, emb, prefix=''):
-        '''
+    def create_cnn(self, emb, prefix=""):
+        """
         A multi-layer CNN.
-        @emb: paddle.layer
-            output of the embedding layer
-        @prefix: str
-            prefix of layers' names, used to share parameters between
-            more than one `cnn` parts.
-        '''
+        :param emb: The word embedding.
+        :type emb: paddle.layer
+        :param prefix: The prefix that will be added to the layers' names.
+        :type prefix: str
+        """
 
         def create_conv(context_len, hidden_size, prefix):
             key = "%s_%d_%d" % (prefix, context_len, hidden_size)
@@ -135,15 +140,15 @@ class DSSM(object):
                 context_len=context_len,
                 hidden_size=hidden_size,
                 # set parameter attr for parameter sharing
-                context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'),
-                fc_param_attr=ParamAttr(name=key + '_fc.w'),
-                fc_bias_attr=ParamAttr(name=key + '_fc.b'),
-                pool_bias_attr=ParamAttr(name=key + '_pool.b'))
+                context_proj_param_attr=ParamAttr(name=key + "contex_proj.w"),
+                fc_param_attr=ParamAttr(name=key + "_fc.w"),
+                fc_bias_attr=ParamAttr(name=key + "_fc.b"),
+                pool_bias_attr=ParamAttr(name=key + "_pool.b"))
             return conv
 
-        logger.info('create a sequence_conv_pool which context width is 3')
+        logger.info("Create a sequence_conv_pool whose context width is 3.")
         conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
-        logger.info('create a sequence_conv_pool which context width is 4')
+        logger.info("Create a sequence_conv_pool whose context width is 4.")
         conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
         return conv_3, conv_4
 
@@ -160,8 +165,8 @@ class DSSM(object):
                 input=_input_layer,
                 size=dim,
                 act=paddle.activation.Tanh(),
-                param_attr=ParamAttr(name='%s.w' % name),
-                bias_attr=ParamAttr(name='%s.b' % name, initial_std=0.))
+                param_attr=ParamAttr(name="%s.w" % name),
+                bias_attr=ParamAttr(name="%s.b" % name, initial_std=0.))
             _input_layer = fc
         return _input_layer
 
@@ -178,7 +183,7 @@ class DSSM(object):
             is_classification=False)
 
     def _build_rank_model(self):
-        '''
+        """
         Build a pairwise rank model, and the cost is returned.
 
         A pairwise rank model has 3 inputs:
           - source sentence
           - left_target sentence
           - right_target sentence
           - label, 1 if left_target should be sorted in front of
             right_target, otherwise 0.
-        '''
+        """
         logger.info("build rank model")
         assert self.model_type.is_rank()
         source = paddle.layer.data(
-            name='source_input',
+            name="source_input",
             type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
         left_target = paddle.layer.data(
-            name='left_target_input',
+            name="left_target_input",
             type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
         right_target = paddle.layer.data(
-            name='right_target_input',
+            name="right_target_input",
             type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
         if not self.is_infer:
             label = paddle.layer.data(
-                name='label_input', type=paddle.data_type.integer_value(1))
+                name="label_input", type=paddle.data_type.integer_value(1))
 
-        prefixs = '_ _ _'.split(
-        ) if self.share_semantic_generator else 'source target target'.split()
-        embed_prefixs = '_ _'.split(
-        ) if self.share_embed else 'source target target'.split()
+        prefixs = "_ _ _".split(
+        ) if self.share_semantic_generator else "source target target".split()
+        embed_prefixs = "_ _ _".split(
+        ) if self.share_embed else "source target target".split()
 
         word_vecs = []
         for id, input in enumerate([source, left_target, right_target]):
@@ -218,9 +223,9 @@ class DSSM(object):
             x = self.model_arch_creater(input, prefix=prefixs[id])
             semantics.append(x)
 
-        # cossim score of source and left_target
+        # The cosine similarity score of source and left_target.
         left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
-        # cossim score of source and right target
+        # The cosine similarity score of source and right_target.
         right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
 
         if not self.is_infer:
@@ -233,34 +238,33 @@ class DSSM(object):
         return right_score
 
     def _build_classification_or_regression_model(self, is_classification):
-        '''
+        """
         Build a classification/regression model, and the cost is returned.
 
-        A Classification has 3 inputs:
+        The classification/regression task expects 3 inputs:
           - source sentence
           - target sentence
           - classification label
-        '''
+        """
         if is_classification:
-            # prepare inputs.
             assert self.class_num
 
         source = paddle.layer.data(
-            name='source_input',
+            name="source_input",
             type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
         target = paddle.layer.data(
-            name='target_input',
+            name="target_input",
             type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
         label = paddle.layer.data(
-            name='label_input',
+            name="label_input",
             type=paddle.data_type.integer_value(self.class_num)
             if is_classification else paddle.data_type.dense_vector(1))
 
-        prefixs = '_ _'.split(
-        ) if self.share_semantic_generator else 'source target'.split()
-        embed_prefixs = '_ _'.split(
-        ) if self.share_embed else 'source target'.split()
+        prefixs = "_ _".split(
+        ) if self.share_semantic_generator else "source target".split()
+        embed_prefixs = "_ _".split(
+        ) if self.share_embed else "source target".split()
 
         word_vecs = []
         for id, input in enumerate([source, target]):
-- 
GitLab
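
Note for reviewers, not part of the patch: below is a minimal usage sketch
of the constructor documented above, mirroring the call made in
dssm/infer.py. DSSM, ModelType, ModelArch, and load_dic all come from the
files touched here (PaddlePaddle v2 API, Python 2); the dictionary path
"./data/vocab.txt" is a hypothetical placeholder.

    # A minimal sketch: build a rank-mode DSSM with a CNN semantic
    # generator, using the keyword arguments documented in network_conf.py.
    import paddle.v2 as paddle

    from network_conf import DSSM
    from utils import ModelType, ModelArch, load_dic

    paddle.init(use_gpu=False, trainer_count=1)

    source_dic_path = "./data/vocab.txt"  # hypothetical dictionary path
    target_dic_path = source_dic_path     # reuse the source dictionary

    # dnn_dims[0] is the embedding size; the remaining entries are the
    # sizes of the fully connected layers in the semantic vector generator.
    prediction = DSSM(
        dnn_dims=[256, 128, 64, 32],
        vocab_sizes=[
            len(load_dic(path))
            for path in (source_dic_path, target_dic_path)
        ],
        model_type=ModelType.RANK_MODE,
        model_arch=ModelArch.CNN_MODE,
        share_semantic_generator=False,
        share_embed=False,
        is_infer=True)()

The trailing () invokes DSSM.__call__, which dispatches to the builder
selected by model_type; per _build_rank_model, in rank mode at inference
time this returns the cosine similarity score layer rather than a training
cost.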