From f3eb9cb36a37efd58cbbf91f9f9e4c888bec1d65 Mon Sep 17 00:00:00 2001
From: yangyaming <yangyaming@baidu.com>
Date: Wed, 17 May 2017 16:38:21 +0800
Subject: [PATCH] Override getValueImpl and revise document

---
 paddle/gserver/evaluators/ChunkEvaluator.cpp  | 36 +++++-------
 .../trainer_config_helpers/evaluators.py      | 57 +++++++++++++------
 2 files changed, 56 insertions(+), 37 deletions(-)
diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp
index b94a641b4..1658282f3 100644
--- a/paddle/gserver/evaluators/ChunkEvaluator.cpp
+++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp
@@ -75,6 +75,7 @@ class ChunkEvaluator : public Evaluator {
   std::vector<Segment> labelSegments_;
   std::vector<Segment> outputSegments_;
   std::set<int> excludedChunkTypes_;
+  mutable std::unordered_map<std::string, real> values_;
 
 public:
   virtual void init(const EvaluatorConfig& config) {
@@ -243,23 +244,22 @@ public:
     return false;
   }
 
-public:
   // three metrics: precision, recall and F1-score
   void getNames(std::vector<std::string>* names) {
-    this->storeLocalValues();
-    names->reserve(this->values_.size());
-    for (auto it = this->values_.begin(); it != this->values_.end(); ++it) {
-      names->push_back(this->config_.name() + "." + it->first);
+    storeLocalValues();
+    names->reserve(names->size() + values_.size());
+    for (auto it = values_.begin(); it != values_.end(); ++it) {
+      names->push_back(config_.name() + "." + it->first);
     }
   }
 
   // get value by field name
   real getValue(const std::string& name, Error* err) const {
-    this->storeLocalValues();
+    storeLocalValues();
     std::vector<std::string> buffers;
     paddle::str::split(name, '.', &buffers);
-    auto it = this->values_.find(buffers[buffers.size() - 1]);
-    if (it == this->values_.end()) {  // not found
+    auto it = values_.find(buffers.back());
+    if (it == values_.end()) {  // not found
       *err = Error("No such key %s", name.c_str());
       return 0.0f;
     }
@@ -268,27 +268,21 @@ public:
   }
 
   // get type of evaluator
-  std::string getType(const std::string& name, Error* err) const {
-    this->getValue(name, err);
-    if (!err->isOK()) {
-      return std::string();
-    }
-    return "chunk";
-  }
+  std::string getTypeImpl() const { return "chunk"; }
 
 private:
   void storeLocalValues() const {
-    CHECK_GT(numOutputSegments_, 0);
-    CHECK_GT(numLabelSegments_, 0);
-    double precision = (double)numCorrect_ / numOutputSegments_;
-    double recall = (double)numCorrect_ / numLabelSegments_;
+    CHECK_GE(numOutputSegments_, 0);
+    CHECK_GE(numLabelSegments_, 0);
+    double precision =
+        !numOutputSegments_ ? 0 : (double)numCorrect_ / numOutputSegments_;
+    double recall =
+        !numLabelSegments_ ? 0 : (double)numCorrect_ / numLabelSegments_;
     values_["precision"] = precision;
     values_["recall"] = recall;
     values_["F1-score"] =
         !numCorrect_ ? 0 : 2 * precision * recall / (precision + recall);
   }
-
-  mutable std::unordered_map<std::string, real> values_;
 };
 
 REGISTER_EVALUATOR(chunk, ChunkEvaluator);
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index 8704a8cde..6900133fd 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -347,45 +347,68 @@ def chunk_evaluator(
         excluded_chunk_types=None, ):
     """
     Chunk evaluator is used to evaluate segment labelling accuracy for a
-    sequence. It calculates precision, recall and F1 score of the chunk detection.
+    sequence. It calculates precision, recall and F1 scores for the chunk detection.
 
-    To use chunk evaluator, the construction of label dict should obey the following rules:
+    To use chunk evaluator, several concepts need to be clarified first.
+    Chunk type is the type of a whole chunk, and a chunk consists of one or several words. (For example, in NER, ORG stands for organization name, PER for person name, etc.)
+    Tag indicates the position of a word in a chunk. (B for begin, I for inside, E for end, S for single)
+    We can name a label by combining tag type and chunk type. (e.g. B-ORG for the beginning of an organization name)
+
+    The construction of label dict should obey the following rules:
     (1) Use one of the listed labelling schemes. These schemes differ in ways indicating chunk boundry.
 
     .. code-block:: python
-     Scheme Begin Inside End   Single
-      plain  0     -      -     -
-      IOB    0     1      -     -
-      IOE    -     0      1     -
-      IOBES  0     1      2     3
+     Scheme    Description
+      plain    Use the same label for the whole chunk.
+      IOB      Two labels for chunk type X, B-X for chunk beginning and I-X for chunk inside.
+      IOE      Two labels for chunk type X, E-X for chunk ending and I-X for chunk inside.
+      IOBES    Four labels for chunk type X, B-X for chunk beginning, I-X for chunk inside, E-X for chunk end and S-X for single word chunk.
     .. code-block:: python
-
-    To make it clear, let's illustrate by a NER example.
-    Assuming that there are two named entity types including ORG and PER which are called 'chunk type' here,
-    if 'IOB' scheme were used, the label set will be extended to a set including B-ORG, I-ORG, B-PER, I-PER and O,
-    in which B-ORG for begining of ORG and I-ORG for end of ORG.
+   
+    To make it clear, let's illustrate by an NER example.
+    Assuming that there are three named entity types including ORG, PER and LOC which are called 'chunk type' here,
+    if 'IOB' scheme were used, the label set will be extended to a set including B-ORG, I-ORG, B-PER, I-PER, B-LOC, I-LOC and O,
+    in which B-ORG stands for the beginning of ORG and I-ORG for the inside of ORG.
     Prefixes which are called 'tag type' here are added to chunk types and there are two tag types including B and I.
     Of course, the training data should be labeled accordingly.
 
-    (2) Map can be done correctly by the listed equations.
+    (2) The mapping from labels to ids follows the equations and assignment rules listed below.
+
+    The following equations extract the tag type and chunk type from a label.
 
     .. code-block:: python
     tagType = label % numTagType
     chunkType = label / numTagType
     otherChunkType = numChunkTypes
     .. code-block:: python
+    
+    The following table shows the mapping rule between tagType and tag type in each scheme.
 
-    Continue the NER example, and the label dict should like this to satify above equations:
+    .. code-block:: python
+     Scheme Begin Inside End   Single
+      plain  0     -      -     -
+      IOB    0     1      -     -
+      IOE    -     0      1     -
+      IOBES  0     1      2     3
+    .. code-block:: python
+
+    Continuing the NER example, the label dict should look like this to satisfy the above equations:
 
     .. code-block:: python
       B-ORG  0
       I-ORG  1
       B-PER  2
       I-PER  3
-      O      4
+      B-LOC  4
+      I-LOC  5
+      O      6
     .. code-block:: python
 
-    Realizing that the number of is chunk type is 2 and number of tag type is 2, it is easy to validate this.
+    In this example, chunkType has three values: 0 for ORG, 1 for PER and 2 for LOC. Because the scheme is
+    "IOB", tagType has two values: 0 for B and 1 for I.
+    Here we will use I-LOC to explain the above mapping rules in detail.
+    For I-LOC, the label id is 5, so we get tagType=1 and chunkType=2, which means I-LOC is a part of the NER chunk LOC
+    and the tag is I.
 
     The simple usage is:
 
@@ -393,6 +416,8 @@ def chunk_evaluator(
 
        eval = chunk_evaluator(input, label, chunk_scheme, num_chunk_types)
 
+    .. code-block:: python
+    
     :param input: The input layers.
     :type input: LayerOutput
     :param label: An input layer containing the ground truth label.
-- 
GitLab