From 34654503aadad1efb4d9de96eb016381c62407db Mon Sep 17 00:00:00 2001
From: Superjom <superjom@gmail.com>
Date: Sun, 9 Jul 2017 17:25:04 +0800
Subject: [PATCH] fix regression bug

---
 dssm/network_conf.py                | 41 ++++++++++++++++++-----------
 dssm/reader.py                      | 28 +++++++++++++++++---
 dssm/train.py                       | 18 ++++++++-----
 sequence_tagging_for_ner/README.md  |  4 +--
 sequence_tagging_for_ner/index.html |  4 +--
 5 files changed, 67 insertions(+), 28 deletions(-)

diff --git a/dssm/network_conf.py b/dssm/network_conf.py
index e88152f3..91607982 100644
--- a/dssm/network_conf.py
+++ b/dssm/network_conf.py
@@ -64,12 +64,14 @@ class DSSM(object):
             'rank': self._build_rank_model,
             'regression': self._build_regression_model,
         }
+        print 'model type: ', str(self.model_type)
         self.model_type_creater = _model_type[str(self.model_type)]
 
     def __call__(self):
-        if self.model_type.is_classification():
-            return self._build_classification_model()
-        return self._build_rank_model()
+        # Build the network through the creater looked up from the model
+        # type (self.model_type_creater) instead of branching on
+        # is_classification(), which could never reach the regression model.
+        return self.model_type_creater()
 
     def create_embedding(self, input, prefix=''):
         '''
@@ -155,10 +157,14 @@ class DSSM(object):
         return _input_layer
 
     def _build_classification_model(self):
+        logger.info("build classification model")
+        assert self.model_type.is_classification()
         return self._build_classification_or_regression_model(
             is_classification=True)
 
     def _build_regression_model(self):
+        logger.info("build regression model")
+        assert self.model_type.is_regression()
         return self._build_classification_or_regression_model(
             is_classification=False)
 
@@ -172,6 +178,8 @@ class DSSM(object):
           - right_target sentence
           - label, 1 if left_target should be sorted in front of right_target, otherwise 0.
         '''
+        logger.info("build rank model")
+        assert self.model_type.is_rank()
         source = paddle.layer.data(
             name='source_input',
             type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
@@ -221,8 +229,9 @@ class DSSM(object):
           - classification label
 
         '''
-        # prepare inputs.
-        assert self.class_num
+        if is_classification:
+            # prepare inputs.
+            assert self.class_num
 
         source = paddle.layer.data(
             name='source_input',
@@ -233,7 +242,7 @@ class DSSM(object):
         label = paddle.layer.data(
             name='label_input',
             type=paddle.data_type.integer_value(self.class_num)
-            if is_classification else paddle.data_type.dense_input)
+            if is_classification else paddle.data_type.dense_vector(1))
 
         prefixs = '_ _'.split(
         ) if self.share_semantic_generator else 'left right'.split()
@@ -250,15 +259,17 @@ class DSSM(object):
             x = self.model_arch_creater(input, prefix=prefixs[id])
             semantics.append(x)
 
-        concated_vector = paddle.layer.concat(semantics)
-        prediction = paddle.layer.fc(
-            input=concated_vector,
-            size=self.class_num,
-            act=paddle.activation.Softmax())
-        cost = paddle.layer.classification_cost(
-            input=prediction,
-            label=label) if is_classification else paddle.layer.mse_cost(
-                prediction, label)
+        if is_classification:
+            concated_vector = paddle.layer.concat(semantics)
+            prediction = paddle.layer.fc(
+                input=concated_vector,
+                size=self.class_num,
+                act=paddle.activation.Softmax())
+            cost = paddle.layer.classification_cost(
+                input=prediction, label=label)
+        else:
+            prediction = paddle.layer.cos_sim(*semantics)
+            cost = paddle.layer.mse_cost(prediction, label)
         return cost, prediction, label
 
 
diff --git a/dssm/reader.py b/dssm/reader.py
index d69d88ec..8664c98d 100644
--- a/dssm/reader.py
+++ b/dssm/reader.py
@@ -15,9 +15,14 @@ class Dataset(object):
         self.source_dic = load_dic(self.source_dic_path)
         self.target_dic = load_dic(self.target_dic_path)
 
-        self.record_reader = self._read_classification_record \
-                             if self.model_type.is_classification() \
-                                        else self._read_rank_record
+        _record_reader = {
+            ModelType.CLASSIFICATION_MODE: self._read_classification_record,
+            ModelType.REGRESSION_MODE: self._read_regression_record,
+            ModelType.RANK_MODE: self._read_rank_record,
+        }
+
+        assert isinstance(model_type, ModelType)
+        self.record_reader = _record_reader[model_type.mode]
 
     def train(self):
         '''
@@ -54,6 +59,23 @@ class Dataset(object):
         label = int(fs[2])
         return (source, target, label, )
 
+    def _read_regression_record(self, line):
+        '''
+        Parse one regression record into a (source, target, [label]) tuple.
+        data format:
+            <source words> [TAB] <target words> [TAB] <label>
+        @line: str
+            a string line which represents a record; <label> is read as float.
+        '''
+        fs = line.strip().split('\t')
+        assert len(fs) == 3, "wrong format for regression\n" + \
+            "the format should be " + \
+            "<source words> [TAB] <target words> [TAB] <label>"
+        source = sent2ids(fs[0], self.source_dic)
+        target = sent2ids(fs[1], self.target_dic)
+        label = float(fs[2])
+        return (source, target, [label], )
+
     def _read_rank_record(self, line):
         '''
         data format:
diff --git a/dssm/train.py b/dssm/train.py
index a62bdabf..a4678b15 100644
--- a/dssm/train.py
+++ b/dssm/train.py
@@ -52,8 +52,9 @@ parser.add_argument(
     type=int,
     required=True,
     default=ModelType.CLASSIFICATION_MODE,
-    help="model type, %d for classification, %d for pairwise rank (default: classification)"
-    % (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE))
+    help="model type, %d for classification, %d for pairwise rank, %d for regression (default: classification)"
+    % (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
+       ModelType.REGRESSION_MODE))
 parser.add_argument(
     '--model_arch',
     type=int,
@@ -124,7 +125,7 @@ def train(train_data_path=None,
     default_train_path = './data/rank/train.txt'
     default_test_path = './data/rank/test.txt'
     default_dic_path = './data/vocab.txt'
-    if model_type.is_classification():
+    if not model_type.is_rank():
         default_train_path = './data/classification/train.txt'
         default_test_path = './data/classification/test.txt'
 
@@ -173,13 +174,18 @@ def train(train_data_path=None,
 
     trainer = paddle.trainer.SGD(
         cost=cost,
-        extra_layers=paddle.evaluator.auc(input=prediction, label=label)
-        if prediction else None,
+        extra_layers=None,
         parameters=parameters,
         update_equation=adam_optimizer)
+    # trainer = paddle.trainer.SGD(
+    #     cost=cost,
+    #     extra_layers=paddle.evaluator.auc(input=prediction, label=label)
+    #     if prediction and model_type.is_classification() else None,
+    #     parameters=parameters,
+    #     update_equation=adam_optimizer)
 
     feeding = {}
-    if model_type.is_classification():
+    if model_type.is_classification() or model_type.is_regression():
         feeding = {'source_input': 0, 'target_input': 1, 'label_input': 2}
     else:
         feeding = {
diff --git a/sequence_tagging_for_ner/README.md b/sequence_tagging_for_ner/README.md
index 13d1c8d6..20712659 100644
--- a/sequence_tagging_for_ner/README.md
+++ b/sequence_tagging_for_ner/README.md
@@ -23,10 +23,10 @@
 
 序列标注可以分为Sequence Classification、Segment Classification和Temporal Classification三类[[1](#参考文献)]，本例只考虑Segment Classification，即对输入序列中的每个元素在输出序列中给出对应的标签。对于NER任务，由于需要标识边界，一般采用[BIO标注方法](http://book.paddlepaddle.org/07.label_semantic_roles/)定义的标签集，如下是一个NER的标注结果示例：
 
-<div  align="center">
+<p  align="center">
 <img src="images/ner_label_ins.png" width = "80%"  align=center /><br>
 图1. BIO标注方法示例
-</div>
+</p>
 
 根据序列标注结果可以直接得到实体边界和实体类别。类似的，分词、词性标注、语块识别、[语义角色标注](http://book.paddlepaddle.org/07.label_semantic_roles/index.cn.html)等任务都可通过序列标注来解决。使用神经网络模型解决问题的思路通常是：前层网络学习输入的特征表示，网络的最后一层在特征基础上完成最终的任务；对于序列标注问题，通常：使用基于RNN的网络结构学习特征，将学习到的特征接入CRF完成序列标注。实际上是将传统CRF中的线性模型换成了非线性神经网络。沿用CRF的出发点是：CRF使用句子级别的似然概率，能够更好的解决标记偏置问题[[2](#参考文献)]。本例也将基于此思路建立模型。虽然，这里以NER任务作为示例，但所给出的模型可以应用到其他各种序列标注任务中。
 
diff --git a/sequence_tagging_for_ner/index.html b/sequence_tagging_for_ner/index.html
index 1706bad2..68b6413a 100644
--- a/sequence_tagging_for_ner/index.html
+++ b/sequence_tagging_for_ner/index.html
@@ -65,10 +65,10 @@
 
 序列标注可以分为Sequence Classification、Segment Classification和Temporal Classification三类[[1](#参考文献)]，本例只考虑Segment Classification，即对输入序列中的每个元素在输出序列中给出对应的标签。对于NER任务，由于需要标识边界，一般采用[BIO标注方法](http://book.paddlepaddle.org/07.label_semantic_roles/)定义的标签集，如下是一个NER的标注结果示例：
 
-<div  align="center">
+<p  align="center">
 <img src="images/ner_label_ins.png" width = "80%"  align=center /><br>
 图1. BIO标注方法示例
-</div>
+</p>
 
 根据序列标注结果可以直接得到实体边界和实体类别。类似的，分词、词性标注、语块识别、[语义角色标注](http://book.paddlepaddle.org/07.label_semantic_roles/index.cn.html)等任务都可通过序列标注来解决。使用神经网络模型解决问题的思路通常是：前层网络学习输入的特征表示，网络的最后一层在特征基础上完成最终的任务；对于序列标注问题，通常：使用基于RNN的网络结构学习特征，将学习到的特征接入CRF完成序列标注。实际上是将传统CRF中的线性模型换成了非线性神经网络。沿用CRF的出发点是：CRF使用句子级别的似然概率，能够更好的解决标记偏置问题[[2](#参考文献)]。本例也将基于此思路建立模型。虽然，这里以NER任务作为示例，但所给出的模型可以应用到其他各种序列标注任务中。
 
-- 
GitLab