fix bug in DSSM regression model type (network config, reader, and training)

34654503 · Superjom · b645b46b · 34654503 · 34654503 · 34654503
5 changed files
--- a/dssm/network_conf.py
+++ b/dssm/network_conf.py
@@ -64,12 +64,14 @@ class DSSM(object):
            'rank': self._build_rank_model,
            'regression': self._build_regression_model,
        }
+        print 'model type: ', str(self.model_type)
        self.model_type_creater = _model_type[str(self.model_type)]

    def __call__(self):
-        if self.model_type.is_classification():
-            return self._build_classification_model()
-        return self._build_rank_model()
+        # if self.model_type.is_classification():
+        #     return self._build_classification_model()
+        # return self._build_rank_model()
+        return self.model_type_creater()

    def create_embedding(self, input, prefix=''):
        '''
@@ -155,10 +157,14 @@ class DSSM(object):
        return _input_layer

    def _build_classification_model(self):
+        logger.info("build classification model")
+        assert self.model_type.is_classification()
        return self._build_classification_or_regression_model(
            is_classification=True)

    def _build_regression_model(self):
+        logger.info("build regression model")
+        assert self.model_type.is_regression()
        return self._build_classification_or_regression_model(
            is_classification=False)

@@ -172,6 +178,8 @@ class DSSM(object):
          - right_target sentence
          - label, 1 if left_target should be sorted in front of right_target, otherwise 0.
        '''
+        logger.info("build rank model")
+        assert self.model_type.is_rank()
        source = paddle.layer.data(
            name='source_input',
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
@@ -221,6 +229,7 @@ class DSSM(object):
          - classification label

        '''
+        if is_classification:
            # prepare inputs.
            assert self.class_num

@@ -233,7 +242,7 @@ class DSSM(object):
        label = paddle.layer.data(
            name='label_input',
            type=paddle.data_type.integer_value(self.class_num)
-            if is_classification else paddle.data_type.dense_input)
+            if is_classification else paddle.data_type.dense_vector(1))

        prefixs = '_ _'.split(
        ) if self.share_semantic_generator else 'left right'.split()
@@ -250,15 +259,17 @@ class DSSM(object):
            x = self.model_arch_creater(input, prefix=prefixs[id])
            semantics.append(x)

+        if is_classification:
            concated_vector = paddle.layer.concat(semantics)
            prediction = paddle.layer.fc(
                input=concated_vector,
                size=self.class_num,
                act=paddle.activation.Softmax())
            cost = paddle.layer.classification_cost(
-            input=prediction,
-            label=label) if is_classification else paddle.layer.mse_cost(
-                prediction, label)
+                input=prediction, label=label)
+        else:
+            prediction = paddle.layer.cos_sim(*semantics)
+            cost = paddle.layer.mse_cost(prediction, label)
        return cost, prediction, label



--- a/dssm/reader.py
+++ b/dssm/reader.py
@@ -15,9 +15,14 @@ class Dataset(object):
        self.source_dic = load_dic(self.source_dic_path)
        self.target_dic = load_dic(self.target_dic_path)

-        self.record_reader = self._read_classification_record \
-                             if self.model_type.is_classification() \
-                                        else self._read_rank_record
+        _record_reader = {
+            ModelType.CLASSIFICATION_MODE: self._read_classification_record,
+            ModelType.REGRESSION_MODE: self._read_regression_record,
+            ModelType.RANK_MODE: self._read_rank_record,
+        }
+
+        assert isinstance(model_type, ModelType)
+        self.record_reader = _record_reader[model_type.mode]

    def train(self):
        '''
@@ -54,6 +59,23 @@ class Dataset(object):
        label = int(fs[2])
        return (source, target, label, )

+    def _read_regression_record(self, line):
+        '''
+        data format:
+            <source words> [TAB] <target words> [TAB] <label>
+
+        @line: str
+            a string line which represent a record.
+        '''
+        fs = line.strip().split('\t')
+        assert len(fs) == 3, "wrong format for regression\n" + \
+            "the format shoud be " +\
+            "<source words> [TAB] <target words> [TAB] <label>'"
+        source = sent2ids(fs[0], self.source_dic)
+        target = sent2ids(fs[1], self.target_dic)
+        label = float(fs[2])
+        return (source, target, [label], )
+
    def _read_rank_record(self, line):
        '''
        data format:

--- a/dssm/train.py
+++ b/dssm/train.py
@@ -52,8 +52,9 @@ parser.add_argument(
    type=int,
    required=True,
    default=ModelType.CLASSIFICATION_MODE,
-    help="model type, %d for classification, %d for pairwise rank (default: classification)"
-    % (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE))
+    help="model type, %d for classification, %d for pairwise rank, %d for regression (default: classification)"
+    % (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
+       ModelType.REGRESSION_MODE))
 parser.add_argument(
    '--model_arch',
    type=int,
@@ -124,7 +125,7 @@ def train(train_data_path=None,
    default_train_path = './data/rank/train.txt'
    default_test_path = './data/rank/test.txt'
    default_dic_path = './data/vocab.txt'
-    if model_type.is_classification():
+    if not model_type.is_rank():
        default_train_path = './data/classification/train.txt'
        default_test_path = './data/classification/test.txt'

@@ -173,13 +174,18 @@ def train(train_data_path=None,

    trainer = paddle.trainer.SGD(
        cost=cost,
-        extra_layers=paddle.evaluator.auc(input=prediction, label=label)
-        if prediction else None,
+        extra_layers=None,
        parameters=parameters,
        update_equation=adam_optimizer)
+    # trainer = paddle.trainer.SGD(
+    #     cost=cost,
+    #     extra_layers=paddle.evaluator.auc(input=prediction, label=label)
+    #     if prediction and model_type.is_classification() else None,
+    #     parameters=parameters,
+    #     update_equation=adam_optimizer)

    feeding = {}
-    if model_type.is_classification():
+    if model_type.is_classification() or model_type.is_regression():
        feeding = {'source_input': 0, 'target_input': 1, 'label_input': 2}
    else:
        feeding = {

--- a/sequence_tagging_for_ner/README.md
+++ b/sequence_tagging_for_ner/README.md
@@ -23,10 +23,10 @@

 序列标注可以分为Sequence Classification、Segment Classification和Temporal Classification三类[[1](#参考文献)]，本例只考虑Segment Classification，即对输入序列中的每个元素在输出序列中给出对应的标签。对于NER任务，由于需要标识边界，一般采用[BIO标注方法](http://book.paddlepaddle.org/07.label_semantic_roles/)定义的标签集，如下是一个NER的标注结果示例：

-<div  align="center">
+<p  align="center">
 <img src="images/ner_label_ins.png" width = "80%"  align=center /><br>
 图1. BIO标注方法示例
-</div>
+</p>

 根据序列标注结果可以直接得到实体边界和实体类别。类似的，分词、词性标注、语块识别、[语义角色标注](http://book.paddlepaddle.org/07.label_semantic_roles/index.cn.html)等任务都可通过序列标注来解决。使用神经网络模型解决问题的思路通常是：前层网络学习输入的特征表示，网络的最后一层在特征基础上完成最终的任务；对于序列标注问题，通常：使用基于RNN的网络结构学习特征，将学习到的特征接入CRF完成序列标注。实际上是将传统CRF中的线性模型换成了非线性神经网络。沿用CRF的出发点是：CRF使用句子级别的似然概率，能够更好的解决标记偏置问题[[2](#参考文献)]。本例也将基于此思路建立模型。虽然，这里以NER任务作为示例，但所给出的模型可以应用到其他各种序列标注任务中。


--- a/sequence_tagging_for_ner/index.html
+++ b/sequence_tagging_for_ner/index.html
@@ -65,10 +65,10 @@

 序列标注可以分为Sequence Classification、Segment Classification和Temporal Classification三类[[1](#参考文献)]，本例只考虑Segment Classification，即对输入序列中的每个元素在输出序列中给出对应的标签。对于NER任务，由于需要标识边界，一般采用[BIO标注方法](http://book.paddlepaddle.org/07.label_semantic_roles/)定义的标签集，如下是一个NER的标注结果示例：

-<div  align="center">
+<p  align="center">
 <img src="images/ner_label_ins.png" width = "80%"  align=center /><br>
 图1. BIO标注方法示例
-</div>
+</p>

 根据序列标注结果可以直接得到实体边界和实体类别。类似的，分词、词性标注、语块识别、[语义角色标注](http://book.paddlepaddle.org/07.label_semantic_roles/index.cn.html)等任务都可通过序列标注来解决。使用神经网络模型解决问题的思路通常是：前层网络学习输入的特征表示，网络的最后一层在特征基础上完成最终的任务；对于序列标注问题，通常：使用基于RNN的网络结构学习特征，将学习到的特征接入CRF完成序列标注。实际上是将传统CRF中的线性模型换成了非线性神经网络。沿用CRF的出发点是：CRF使用句子级别的似然概率，能够更好的解决标记偏置问题[[2](#参考文献)]。本例也将基于此思路建立模型。虽然，这里以NER任务作为示例，但所给出的模型可以应用到其他各种序列标注任务中。