dssm model ready

7d6f6d74 · Superjom · 5c5a64a3 · 7d6f6d74 · 7d6f6d74 · 7d6f6d74
隐藏空白更改
内联并排

Showing with 197 additions and 77 deletions

dssm/network_conf.py dssm/network_conf.py +121 -50

dssm/reader.py dssm/reader.py +10 -8

dssm/train.py dssm/train.py +62 -14

dssm/utils.py dssm/utils.py +4 -5

未找到文件。
--- a/dssm/network_conf.py
+++ b/dssm/network_conf.py
 from paddle import v2 as paddle
 from paddle.v2.attr import ParamAttr
-from utils import TaskType, logger, ModelType
+from utils import TaskType, logger, ModelType, ModelArch


 class DSSM(object):
    def __init__(self,
                 dnn_dims=[],
                 vocab_sizes=[],
-                 model_type=ModelType.CLASSIFICATION,
+                 model_type=ModelType.create_classification(),
+                 model_arch=ModelArch.create_cnn(),
                 share_semantic_generator=False,
                 class_num=None,
                 share_embed=False):
@@ -16,8 +17,10 @@ class DSSM(object):
            dimensions of each layer in semantic vector generator.
        @vocab_sizes: 2-d tuple
            size of both left and right items.
-        @model_type: str
-            type of task, should be 'rank', 'regression' or 'classification'
+        @model_type: int
+            type of task, should be 'rank: 0', 'regression: 1' or 'classification: 2'
+        @model_arch: int
+            model architecture
        @share_semantic_generator: bool
            whether to share the semantic vector generator for both left and right.
        @share_embed: bool
@@ -28,18 +31,36 @@ class DSSM(object):
        assert len(
            vocab_sizes
        ) == 2, "vocab_sizes specify the sizes left and right inputs, and dim should be 2."
+        assert len(dnn_dims) > 1, "more than two layers is needed."

        self.dnn_dims = dnn_dims
        self.vocab_sizes = vocab_sizes
        self.share_semantic_generator = share_semantic_generator
        self.share_embed = share_embed
-        self.model_type = model_type
+        self.model_type = ModelType(model_type)
+        self.model_arch = ModelArch(model_arch)
        self.class_num = class_num
-
+        logger.warning("build DSSM model with config of %s, %s" %
+                       (self.model_type, self.model_arch))
        logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))

+        # bind model architecture
+        _model_arch = {
+            'cnn': self.create_cnn,
+            'fc': self.create_fc,
+        }
+        self.model_arch_creater = _model_arch[str(model_arch)]
+
+        # build model type
+        _model_type = {
+            'classification': self._build_classification_model,
+            'rank': self._build_rank_model,
+            'regression': self._build_regression_model,
+        }
+        self.model_type_creater = _model_type[str(self.model_type)]
+
    def __call__(self):
-        if self.model_type == ModelType.CLASSIFICATION:
+        if self.model_type.is_classification():
            return self._build_classification_model()
        return self._build_rank_model()

@@ -47,6 +68,8 @@ class DSSM(object):
        '''
        Create an embedding table whose name has a `prefix`.
        '''
+        logger.info("create embedding table [%s] which dimention is %d" %
+                    (prefix, self.dnn_dims[0]))
        emb = paddle.layer.embedding(
            input=input,
            size=self.dnn_dims[0],
@@ -66,6 +89,8 @@ class DSSM(object):
            input=emb, pooling_type=paddle.pooling.Max())
        for id, dim in enumerate(self.dnn_dims[1:]):
            name = "%s_fc_%d_%d" % (prefix, id, dim)
+            logger.info("create fc layer [%s] which dimention is %d" % (name,
+                                                                        dim))
            fc = paddle.layer.fc(
                name=name,
                input=_input_layer,
@@ -85,53 +110,49 @@ class DSSM(object):
        @prefix: str
            prefix of layers' names, used to share parameters between more than one `cnn` parts.
        '''
-        pass

-    def _build_classification_model(self):
-        '''
-        Build a classification model, and the cost is returned.
-
-        A Classification has 3 inputs:
-          - source sentence
-          - target sentence
-          - classification label
+        def create_conv(context_len, hidden_size, prefix):
+            key = "%s_%d_%d" % (prefix, context_len, hidden_size)
+            conv = paddle.networks.sequence_conv_pool(
+                input=emb,
+                context_len=context_len,
+                hidden_size=hidden_size,
+                # set parameter attr for parameter sharing
+                context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'),
+                fc_param_attr=ParamAttr(name=key + '_fc.w'),
+                fc_bias_attr=ParamAttr(name=key + '_fc.b'),
+                pool_bias_attr=ParamAttr(name=key + '_pool.b'))
+            return conv

-        '''
-        # prepare inputs.
-        assert self.class_num
+        logger.info('create a sequence_conv_pool which context width is 3')
+        conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
+        logger.info('create a sequence_conv_pool which context width is 4')
+        conv_4 = create_conv(4, self.dnn_dims[1], "cnn")

-        source = paddle.layer.data(
-            name='source_input',
-            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
-        target = paddle.layer.data(
-            name='target_input',
-            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
-        label = paddle.layer.data(
-            name='label_input',
-            type=paddle.data_type.integer_value(self.class_num))
-
-        prefixs = '_ _'.split(
-        ) if self.share_semantic_generator else 'left right'.split()
-        embed_prefixs = '_ _'.split(
-        ) if self.share_embed else 'left right'.split()
-
-        word_vecs = []
-        for id, input in enumerate([source, target]):
-            x = self.create_embedding(input, prefix=embed_prefixs[id])
-            word_vecs.append(x)
+        # if more than two dims are given, then fc layers will be added for the extra dims.
+        if len(self.dnn_dims) > 2:
+            _input_layer = [conv_3, conv_4]
+            for id, dim in enumerate(self.dnn_dims[2:]):
+                name = "%s_fc_%d_%d" % (prefix, id, dim)
+                logger.info("create fc layer [%s] which dimention is %d" %
+                            (name, dim))
+                fc = paddle.layer.fc(
+                    name=name,
+                    input=_input_layer,
+                    size=dim,
+                    act=paddle.activation.Tanh(),
+                    param_attr=ParamAttr(name='%s.w' % name),
+                    bias_attr=ParamAttr(name='%s.b' % name))
+                _input_layer = fc
+        return _input_layer

-        semantics = []
-        for id, input in enumerate(word_vecs):
-            x = self.create_fc(input, prefix=prefixs[id])
-            semantics.append(x)
+    def _build_classification_model(self):
+        return self._build_classification_or_regression_model(
+            is_classification=True)

-        concated_vector = paddle.layer.concat(semantics)
-        prediction = paddle.layer.fc(
-            input=concated_vector,
-            size=self.class_num,
-            act=paddle.activation.Softmax())
-        cost = paddle.layer.classification_cost(input=prediction, label=label)
-        return cost, prediction, label
+    def _build_regression_model(self):
+        return self._build_classification_or_regression_model(
+            is_classification=False)

    def _build_rank_model(self):
        '''
@@ -167,7 +188,7 @@ class DSSM(object):

        semantics = []
        for id, input in enumerate(word_vecs):
-            x = self.create_fc(input, prefix=prefixs[id])
+            x = self.model_arch_creater(input, prefix=prefixs[id])
            semantics.append(x)

        # cossim score of source and left_target
@@ -182,6 +203,56 @@ class DSSM(object):
        # so AUC will not used.
        return cost, None, None

+    def _build_classification_or_regression_model(self, is_classification):
+        '''
+        Build a classification or regression model (selected by `is_classification`); the cost, prediction and label layers are returned.
+
+        A Classification has 3 inputs:
+          - source sentence
+          - target sentence
+          - classification label
+
+        '''
+        # prepare inputs.
+        assert self.class_num
+
+        source = paddle.layer.data(
+            name='source_input',
+            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
+        target = paddle.layer.data(
+            name='target_input',
+            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
+        label = paddle.layer.data(
+            name='label_input',
+            type=paddle.data_type.integer_value(self.class_num)
+            if is_classification else paddle.data_type.dense_input)
+
+        prefixs = '_ _'.split(
+        ) if self.share_semantic_generator else 'left right'.split()
+        embed_prefixs = '_ _'.split(
+        ) if self.share_embed else 'left right'.split()
+
+        word_vecs = []
+        for id, input in enumerate([source, target]):
+            x = self.create_embedding(input, prefix=embed_prefixs[id])
+            word_vecs.append(x)
+
+        semantics = []
+        for id, input in enumerate(word_vecs):
+            x = self.model_arch_creater(input, prefix=prefixs[id])
+            semantics.append(x)
+
+        concated_vector = paddle.layer.concat(semantics)
+        prediction = paddle.layer.fc(
+            input=concated_vector,
+            size=self.class_num,
+            act=paddle.activation.Softmax())
+        cost = paddle.layer.classification_cost(
+            input=prediction,
+            label=label) if is_classification else paddle.layer.mse_cost(
+                prediction, label)
+        return cost, prediction, label
+

 class RankMetrics(object):
    '''

--- a/dssm/reader.py
+++ b/dssm/reader.py
@@ -4,32 +4,34 @@ from utils import UNK, ModelType, TaskType, load_dic, sent2ids, logger, ModelTyp


 class Dataset(object):
-    def __init__(self,
-                 train_path,
-                 test_path,
-                 source_dic_path,
-                 target_dic_path,
-                 model_type=ModelType.RANK):
+    def __init__(self, train_path, test_path, source_dic_path, target_dic_path,
+                 model_type):
        self.train_path = train_path
        self.test_path = test_path
        self.source_dic_path = source_dic_path
        self.target_dic_path = target_dic_path
-        self.model_type = model_type
+        self.model_type = ModelType(model_type)

        self.source_dic = load_dic(self.source_dic_path)
        self.target_dic = load_dic(self.target_dic_path)

        self.record_reader = self._read_classification_record \
-                                if self.model_type == ModelType.CLASSIFICATION \
+                             if self.model_type.is_classification() \
                                        else self._read_rank_record

    def train(self):
+        '''
+        Load trainset.
+        '''
        logger.info("[reader] load trainset from %s" % self.train_path)
        with open(self.train_path) as f:
            for line_id, line in enumerate(f):
                yield self.record_reader(line)

    def test(self):
+        '''
+        Load testset.
+        '''
        logger.info("[reader] load testset from %s" % self.test_path)
        with open(self.test_path) as f:
            for line_id, line in enumerate(f):

--- a/dssm/train.py
+++ b/dssm/train.py
@@ -6,21 +6,24 @@ import gzip
 import paddle.v2 as paddle
 from network_conf import DSSM
 import reader
-from utils import TaskType, load_dic, logger, ModelType
+from utils import TaskType, load_dic, logger, ModelType, ModelArch

 parser = argparse.ArgumentParser(description="PaddlePaddle DSSM example")

 parser.add_argument(
+    '-i',
    '--train_data_path',
    type=str,
    required=False,
    help="path of training dataset")
 parser.add_argument(
+    '-t',
    '--test_data_path',
    type=str,
    required=False,
    help="path of testing dataset")
 parser.add_argument(
+    '-s',
    '--source_dic_path',
    type=str,
    required=False,
@@ -32,21 +35,32 @@ parser.add_argument(
    help="path of the target's word dic, if not set, the `source_dic_path` will be used"
 )
 parser.add_argument(
+    '-b',
    '--batch_size',
    type=int,
    default=10,
    help="size of mini-batch (default:10)")
 parser.add_argument(
+    '-p',
    '--num_passes',
    type=int,
    default=10,
    help="number of passes to run(default:10)")
 parser.add_argument(
+    '-y',
    '--model_type',
    type=int,
-    default=ModelType.CLASSIFICATION,
+    required=True,
+    default=ModelType.CLASSIFICATION_MODE,
    help="model type, %d for classification, %d for pairwise rank (default: classification)"
-    % (ModelType.CLASSIFICATION, ModelType.RANK))
+    % (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE))
+parser.add_argument(
+    '--model_arch',
+    type=int,
+    required=True,
+    default=ModelArch.CNN_MODE,
+    help="model architecture, %d for CNN, %d for FC" % (ModelArch.CNN_MODE,
+                                                        ModelArch.FC_MODE))
 parser.add_argument(
    '--share_network_between_source_target',
    type=bool,
@@ -61,36 +75,56 @@ parser.add_argument(
    '--dnn_dims',
    type=str,
    default='256,128,64,32',
-    help="dimentions of dnn layers, default is '256,128,64,32', which means create a 4-layer dnn, dementions of each layer is 256, 128, 64 and 32"
+    help="dimentions of dnn layers, default is '256,128,64,32', which means create a 4-layer dnn, demention of each layer is 256, 128, 64 and 32"
 )
 parser.add_argument(
    '--num_workers', type=int, default=1, help="num worker threads, default 1")
+parser.add_argument(
+    '--use_gpu',
+    type=bool,
+    default=False,
+    help="whether to use GPU devices (default: False)")
+parser.add_argument(
+    '-c',
+    '--class_num',
+    type=int,
+    default=0,
+    help="number of categories for classification task.")

+# arguments check.
 args = parser.parse_args()
 args.model_type = ModelType(args.model_type)
+args.model_arch = ModelArch(args.model_arch)
+if args.model_type.is_classification():
+    assert args.class_num > 1, "--class_num should be set in classification task."

 layer_dims = [int(i) for i in args.dnn_dims.split(',')]
 target_dic_path = args.source_dic_path if not args.target_dic_path else args.target_dic_path

+model_save_name_prefix = "dssm_pass_%s_%s" % (args.model_type,
+                                              args.model_arch, )
+

 def train(train_data_path=None,
          test_data_path=None,
          source_dic_path=None,
          target_dic_path=None,
-          model_type=ModelType.CLASSIFICATION,
+          model_type=ModelType.create_classification(),
+          model_arch=ModelArch.create_cnn(),
          batch_size=10,
          num_passes=10,
          share_semantic_generator=False,
          share_embed=False,
          class_num=None,
-          num_workers=1):
+          num_workers=1,
+          use_gpu=False):
    '''
    Train the DSSM.
    '''
    default_train_path = './data/rank/train.txt'
    default_test_path = './data/rank/test.txt'
    default_dic_path = './data/vocab.txt'
-    if model_type == ModelType.CLASSIFICATION:
+    if model_type.is_classification():
        default_train_path = './data/classification/train.txt'
        default_test_path = './data/classification/test.txt'

@@ -107,7 +141,7 @@ def train(train_data_path=None,
        test_path=test_data_path,
        source_dic_path=source_dic_path,
        target_dic_path=target_dic_path,
-        model_type=args.model_type, )
+        model_type=model_type, )

    train_reader = paddle.batch(
        paddle.reader.shuffle(dataset.train, buf_size=1000),
@@ -117,7 +151,7 @@ def train(train_data_path=None,
        paddle.reader.shuffle(dataset.test, buf_size=1000),
        batch_size=batch_size)

-    paddle.init(use_gpu=False, trainer_count=num_workers)
+    paddle.init(use_gpu=use_gpu, trainer_count=num_workers)

    cost, prediction, label = DSSM(
        dnn_dims=layer_dims,
@@ -125,6 +159,7 @@ def train(train_data_path=None,
            len(load_dic(path)) for path in [source_dic_path, target_dic_path]
        ],
        model_type=model_type,
+        model_arch=model_arch,
        share_semantic_generator=share_semantic_generator,
        class_num=class_num,
        share_embed=share_embed)()
@@ -144,7 +179,7 @@ def train(train_data_path=None,
        update_equation=adam_optimizer)

    feeding = {}
-    if model_type == ModelType.CLASSIFICATION:
+    if model_type.is_classification():
        feeding = {'source_input': 0, 'target_input': 1, 'label_input': 2}
    else:
        feeding = {
@@ -165,13 +200,14 @@ def train(train_data_path=None,

        if isinstance(event, paddle.event.EndPass):
            if test_reader is not None:
-                if model_type == ModelType.CLASSIFICATION:
+                if model_type.is_classification():
                    result = trainer.test(reader=test_reader, feeding=feeding)
                    logger.info("Test at Pass %d, %s \n" % (event.pass_id,
                                                            result.metrics))
                else:
                    result = None
-            with gzip.open("dssm_pass_%05d.tar.gz" % event.pass_id, "w") as f:
+            with gzip.open("dssm_%s_pass_%05d.tar.gz" %
+                           (model_save_name_prefix, event.pass_id), "w") as f:
                parameters.to_tar(f)

    trainer.train(
@@ -184,5 +220,17 @@ def train(train_data_path=None,


 if __name__ == '__main__':
-    # train(class_num=2)
-    train(model_type=ModelType.RANK)
+    train(
+        train_data_path=args.train_data_path,
+        test_data_path=args.test_data_path,
+        source_dic_path=args.source_dic_path,
+        target_dic_path=args.target_dic_path,
+        model_type=ModelType(args.model_type),
+        model_arch=ModelArch(args.model_arch),
+        batch_size=args.batch_size,
+        num_passes=args.num_passes,
+        share_semantic_generator=args.share_network_between_source_target,
+        share_embed=args.share_embed,
+        class_num=args.class_num,
+        num_workers=args.num_workers,
+        use_gpu=args.use_gpu)
--- a/dssm/utils.py
+++ b/dssm/utils.py
@@ -43,7 +43,7 @@ def make_create_method(cls):
        setattr(cls, 'create_' + mode, method(mode))


-def make_str_method(cls):
+def make_str_method(cls, type_name='unk'):
    def _str_(self):
        for mode in cls.modes:
            if self.mode == getattr(cls, mode_attr_name(mode)):
@@ -55,6 +55,7 @@ def make_str_method(cls):
    setattr(cls, '__str__', _str_)
    setattr(cls, '__repr__', _str_)
    setattr(cls, '__hash__', _hash_)
+    cls.__name__ = type_name


 def _init_(self, mode, cls):
@@ -63,7 +64,8 @@ def _init_(self, mode, cls):
    elif isinstance(mode, cls):
        self.mode = mode.mode
    else:
-        raise
+        raise Exception("wrong mode type, get type: %s, value: %s" %
+                        (type(mode), mode))


 def build_mode_class(cls):
@@ -74,9 +76,6 @@ def build_mode_class(cls):


 class TaskType(object):
-    # TRAIN_MODE = 0
-    # TEST_MODE = 1
-    # INFER_MODE = 2
    modes = 'train test infer'.split()

    def __init__(self, mode):