From e1f57bfd66245f78d04f47d670cba5592d5734b2 Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Wed, 19 Oct 2016 15:19:32 +0800
Subject: [PATCH] add base class for seqlastin/max/average layer (#187)

---
 paddle/gserver/layers/AverageLayer.cpp        | 74 ++--------------
 paddle/gserver/layers/AverageLayer.h          | 19 ++---
 paddle/gserver/layers/MaxLayer.cpp            | 79 ++---------------
 paddle/gserver/layers/MaxLayer.h              | 19 +++--
 .../layers/SequenceLastInstanceLayer.cpp      | 82 +++---------------
 paddle/gserver/layers/SequencePoolLayer.cpp   | 84 +++++++++++++++++++
 paddle/gserver/layers/SequencePoolLayer.h     | 57 +++++++++++++
 7 files changed, 188 insertions(+), 226 deletions(-)
 create mode 100644 paddle/gserver/layers/SequencePoolLayer.cpp
 create mode 100644 paddle/gserver/layers/SequencePoolLayer.h
diff --git a/paddle/gserver/layers/AverageLayer.cpp b/paddle/gserver/layers/AverageLayer.cpp
index 6e52217de4..7401cdc9a5 100644
--- a/paddle/gserver/layers/AverageLayer.cpp
+++ b/paddle/gserver/layers/AverageLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "AverageLayer.h"
 
 #include "paddle/utils/Logging.h"
@@ -25,13 +24,8 @@ REGISTER_LAYER(average, AverageLayer);
 
 bool AverageLayer::init(const LayerMap& layerMap,
                         const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
+  SequencePoolLayer::init(layerMap, parameterMap);
 
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
   dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_);
   outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_);
   // average strategy
@@ -44,57 +38,15 @@ bool AverageLayer::init(const LayerMap& layerMap,
   } else {
     LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy();
   }
-  // transform to which sequence type
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  setNeedSequenceInfo(false);
   return true;
 }
 
 void AverageLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  // average layer should have exactly 1 input
-  CHECK_EQ(1U, inputLayers_.size());
+  SequencePoolLayer::forward(passType);
 
-  size_t dim = getSize();
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  if (type_) {
-    CHECK(input.subSequenceStartPositions)
-      << "when trans_type = seq, input must hasSubseq";
-  }
-  int64_t newBatchSize =
-      type_ ? input.getNumSubSequences() : input.getNumSequences();
-  ICpuGpuVectorPtr startPositions =
-      type_ ? input.subSequenceStartPositions
-            : input.sequenceStartPositions;
-  const int* starts = startPositions->getData(false);
-  size_t numSequences = startPositions->getSize() - 1;
-
-  // check
-  CHECK_EQ(numSequences, (size_t)newBatchSize);
-  CHECK_EQ(starts[numSequences], input.getBatchSize());
-  CHECK_EQ(dim, input.value->getWidth());
-
-  resetOutput(newBatchSize, dim);
-  auto startsPos = startPositions->getVector(useGpu_);
   MatrixPtr inputValue = getInputValue(0);
-  getOutputValue()->sequenceAvgForward(*inputValue, *startsPos, mode_);
-
-  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
-   * thus, in this case, output_ has no sequenceStartPositions.
-   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
-   * case, we should compute the new sequenceStartPositions.
-  */
-  if (type_) {
-    output_.degradeSequence(input, useGpu_);
-  }
+  getOutputValue()->sequenceAvgForward(
+      *inputValue, *startPositions_->getVector(useGpu_), mode_);
 
   /* add the bias-vector AFTER average operation */
   if (biases_.get() != NULL) {
@@ -106,26 +58,16 @@ void AverageLayer::forward(PassType passType) {
 }
 
 void AverageLayer::backward(const UpdateCallback& callback) {
-  const Argument& input = getInput(0);
-  ICpuGpuVectorPtr startPositions =
-      type_ ? input.subSequenceStartPositions
-            : input.sequenceStartPositions;
-  const int* starts = startPositions->getData(false);
-  /* Do derivation */ { backwardActivation(); }
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
+  SequencePoolLayer::backward(callback);
 
+  const int* starts = startPositions_->getData(false);
   MatrixPtr grad = getInputGrad(0);
+
   if (grad) {
     size_t dim = getSize();
     real* gradientData = getInputGrad(0)->getData();
     real* gradient = getOutputGrad()->getData();
-    size_t numSequences = startPositions->getSize() - 1;
+    size_t numSequences = startPositions_->getSize() - 1;
     for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
       // TODO(Dangqingqing) optimization for GPU
       int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h
index ae910ddefa..1edc2ace49 100644
--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #pragma once
 
-#include "Layer.h"
+#include "SequencePoolLayer.h"
 #include "paddle/math/Matrix.h"
 
 namespace paddle {
@@ -23,20 +22,21 @@ namespace paddle {
 /**
  * A layer for "internal average" for sequence input.
  * Input: one or more sequences. Each sequence contains some instances.
- * If AverageLevel = kNonSeq:
+ * If SequenceLevel = kNonSeq:
  *    Output: output size is the number of input sequences (NOT input instances)
  *    output[i] = average_{for each instance in this sequence}{input[i]}
- * If AverageLevel = kSeq:
+ * If SequenceLevel = kSeq:
  *    Check input sequence must has sub-sequence
  *    Output: output size is the number of input sub-sequences
  *    output[i] = average_{for each instance in this sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
  */
-
-class AverageLayer : public Layer {
+class AverageLayer : public SequencePoolLayer {
 public:
   enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 };
-  enum AverageLevel { kNonSeq = 0, kSeq = 1 };
-  explicit AverageLayer(const LayerConfig& config) : Layer(config) {}
+  explicit AverageLayer(const LayerConfig& config)
+      : SequencePoolLayer(config) {}
 
   ~AverageLayer() {}
 
@@ -46,11 +46,8 @@ public:
   void backward(const UpdateCallback& callback = nullptr);
 
 protected:
-  std::unique_ptr<Weight> biases_;
   MatrixPtr outMtx_;
   MatrixPtr dataMtx_;
   int mode_;
-  int type_;
 };
-
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MaxLayer.cpp b/paddle/gserver/layers/MaxLayer.cpp
index 226e0ea87d..c4ffe894ec 100644
--- a/paddle/gserver/layers/MaxLayer.cpp
+++ b/paddle/gserver/layers/MaxLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "MaxLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
@@ -21,55 +20,11 @@ namespace paddle {
 
 REGISTER_LAYER(max, MaxLayer);
 
-bool MaxLayer::init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  // transform to which sequence type
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  setNeedSequenceInfo(false);
-  return true;
-}
-
 void MaxLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  // max layer should have exactly 1 input
-  CHECK_EQ(1U, inputLayers_.size());
-
-  size_t dim = getSize();
-  const Argument& input = getInput(0);
-  int64_t newBatchSize =
-      type_ ? input.getNumSubSequences() : input.getNumSequences();
-  ICpuGpuVectorPtr startPositions =
-      type_ ? input.subSequenceStartPositions
-            : input.sequenceStartPositions;
-  auto starts = startPositions->getVector(useGpu_);
-  size_t numSequences = startPositions->getSize() - 1;
+  SequencePoolLayer::forward(passType);
 
-  CHECK_EQ(dim, input.value->getWidth());
-  CHECK_EQ(numSequences, (size_t)newBatchSize);
-  CHECK_EQ(startPositions->getData(false)[numSequences], input.getBatchSize());
-  if (type_) {
-    // when trans_type = seq, input must hasSubseq
-    CHECK_EQ(input.hasSubseq(), 1UL);
-  }
-
-  // reset output: resize to "num of sequences", not "batch size".
-  resetOutput(newBatchSize, dim);
-
-  IVector::resizeOrCreate(maxIndex_, newBatchSize * dim, useGpu(deviceId_));
+  IVector::resizeOrCreate(maxIndex_, newBatchSize_ * getSize(),
+                          useGpu(deviceId_));
   maxIndex_->zeroMem();
 
   MatrixPtr inputValue = getInputValue(0);
@@ -77,16 +32,8 @@ void MaxLayer::forward(PassType passType) {
 
   {
     REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str());
-    outputValue->maxSequenceForward(*inputValue, *starts, *maxIndex_);
-  }
-
-  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
-   * thus, in this case, output_ has no cpuSequenceStartPositions.
-   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
-   * case, we should compute the new cpuSequenceStartPositions.
-  */
-  if (type_) {
-    output_.degradeSequence(input, useGpu_);
+    outputValue->maxSequenceForward(
+        *inputValue, *startPositions_->getVector(useGpu_), *maxIndex_);
   }
 
   if (config_.output_max_index()) {
@@ -104,24 +51,14 @@ void MaxLayer::forward(PassType passType) {
 void MaxLayer::backward(const UpdateCallback& callback) {
   CHECK(!config_.output_max_index())
       << "backward is not available when output_max_index is set";
-  /* Do derivation */ { backwardActivation(); }
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
+  SequencePoolLayer::backward(callback);
 
   MatrixPtr inputGrad = getInputGrad(0);
   MatrixPtr outputGrad = getOutputGrad();
   if (inputGrad) {
-    ICpuGpuVectorPtr starts =
-        type_ ? getInput(0).subSequenceStartPositions
-              : getInput(0).sequenceStartPositions;
     REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str());
-    inputGrad->maxSequenceBackward(*outputGrad,
-        *(starts->getVector(useGpu_)), *maxIndex_);
+    inputGrad->maxSequenceBackward(
+        *outputGrad, *(startPositions_->getVector(useGpu_)), *maxIndex_);
   }
 }
 
diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h
index b4c34e665d..e6dcfe9c67 100644
--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@@ -15,7 +15,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "Layer.h"
+#include "SequencePoolLayer.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/utils/ThreadLocal.h"
 
@@ -24,29 +24,30 @@ namespace paddle {
 /**
  * A layer for "internal max" for sequence input.
  * Input: one or more sequences. Each sequence contains some instances.
- * If MaxLevel = kNonSeq:
+ * If SequenceLevel = kNonSeq:
  *    Output: output size is the number of input sequences (NOT input instances)
  *    output[i] = max_{for each instance in this sequence}{input[i]}
- * If MaxLevel = kSeq:
+ * If SequenceLevel = kSeq:
  *    Check input sequence must has sub-sequence
  *    Output: output size is the number of input sub-sequences
  *    output[i] = max_{for each instance in this sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
  */
 
-class MaxLayer : public Layer {
+class MaxLayer : public SequencePoolLayer {
 protected:
-  std::unique_ptr<Weight> biases_;
   // maxIndex_[i][j] = k : the value at (i, j) is from input[k].
   IVectorPtr maxIndex_;
-  int type_;
 
 public:
-  explicit MaxLayer(const LayerConfig& config) : Layer(config) {}
-  enum MaxLevel {kNonSeq = 0, kSeq = 1 };
+  explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
 
   ~MaxLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+    return SequencePoolLayer::init(layerMap, parameterMap);
+  }
 
   void forward(PassType passType);
   void backward(const UpdateCallback& callback = nullptr);
diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
index f4d26ba21b..26d9536dd5 100644
--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 
 #include "paddle/utils/Logging.h"
 
-#include "Layer.h"
+#include "SequencePoolLayer.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/utils/Stat.h"
 
@@ -29,20 +29,19 @@ namespace paddle {
  * If SequenceLevel = kSeq:
  *   Check input sequence must has sub-sequence
  *   Output: a sequence containing only the last instance of each sub-sequence
- * of the input sequence
+ *           of the input sequence
+ *
+ * The config file api is last_seq and first_seq.
  */
 
-class SequenceLastInstanceLayer : public Layer {
+class SequenceLastInstanceLayer : public SequencePoolLayer {
 protected:
-  std::unique_ptr<Weight> biases_;
   MatrixPtr tmpSrc_;
   MatrixPtr tmpDest_;
-  enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
-  int type_;
 
 public:
   explicit SequenceLastInstanceLayer(const LayerConfig& config)
-      : Layer(config) {}
+      : SequencePoolLayer(config) {}
 
   ~SequenceLastInstanceLayer() {}
 
@@ -56,56 +55,20 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
 
 bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
                                      const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  // seqlastins layer should have exactly 1 input
-  CHECK_EQ(1U, inputLayers_.size());
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
+  SequencePoolLayer::init(layerMap, parameterMap);
 
   tmpSrc_ =
       Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
   tmpDest_ =
       Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
 
-  // transform to which sequence type
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  setNeedSequenceInfo(false);
   return true;
 }
 
 void SequenceLastInstanceLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t dim = getSize();
-  const Argument& input = getInput(0);
+  SequencePoolLayer::forward(passType);
 
-  // check
-  CHECK(input.sequenceStartPositions);
-  if (type_) {
-    CHECK(input.subSequenceStartPositions)
-      << "when trans_type = seq, input must hasSubseq";
-  }
-  auto startPositions =
-      type_ ? input.subSequenceStartPositions->getVector(false)
-            : input.sequenceStartPositions->getVector(false);
-  size_t height = type_ ? input.getNumSubSequences() : input.getNumSequences();
-  CHECK_EQ(dim, input.value->getWidth());
-  CHECK_EQ(startPositions->getData()[height], input.getBatchSize());
-  CHECK_EQ(height, startPositions->getSize() - 1);
-
-  reserveOutput(height, dim);
-  const int* starts = startPositions->getData();
+  const int* starts = startPositions_->getData(false);
   MatrixPtr inputValue = getInputValue(0);
   MatrixPtr outputValue = getOutputValue();
 
@@ -113,21 +76,13 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
     AsyncGpuBlock asyncGpuBlock;
     REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());
 
-    for (size_t seqId = 0; seqId < height; ++seqId) {
+    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
       int insId =
           config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1;
 
       outputValue->subMatrix(seqId, 1, tmpDest_)
           ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
     }
-    /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
-     * thus, in this case, output_ has no sequenceStartPositions.
-     * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
-     * case, we should compute the new sequenceStartPositions.
-    */
-    if (type_) {
-      output_.degradeSequence(input, useGpu_);
-    }
   }
 
   if (biases_.get() != NULL) {
@@ -139,23 +94,12 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
 }
 
 void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {
-  /* activation, should set to 'linear' in most cases */
-  backwardActivation();
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
+  SequencePoolLayer::backward(callback);
 
   MatrixPtr inputGrad = getInputGrad(0);
   MatrixPtr outputGrad = getOutputGrad();
-  auto startPositions =
-      type_ ? getInput(0).subSequenceStartPositions->getVector(false)
-            : getInput(0).sequenceStartPositions->getVector(false);
-  const int* starts = startPositions->getData();
-  size_t numSequences = startPositions->getSize() - 1;
+  const int* starts = startPositions_->getData(false);
+  size_t numSequences = startPositions_->getSize() - 1;
 
   if (inputGrad) {
     AsyncGpuBlock asyncGpuBlock;
diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp
new file mode 100644
index 0000000000..55be73d363
--- /dev/null
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/utils/Logging.h"
+#include "SequencePoolLayer.h"
+
+namespace paddle {
+
+bool SequencePoolLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  // Run the generic Layer initialization before any pooling-specific setup.
+  Layer::init(layerMap, parameterMap);
+
+  // Every sequence pooling layer (seqlastins/max/average) takes one input.
+  CHECK_EQ(1U, inputLayers_.size());
+  // The bias is optional; it is only created when a bias parameter exists.
+  if (biasParameter_) {
+    biases_.reset(new Weight(1, getSize(), biasParameter_));
+  }
+
+  // trans_type selects pooling per sequence or per sub-sequence.
+  const auto& transType = config_.trans_type();
+  if (transType == "seq") {
+    type_ = kSeq;
+  } else if (transType == "non-seq") {
+    type_ = kNonSeq;
+  } else {
+    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
+  }
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SequencePoolLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& input = getInput(0);
+  // Validate the start positions up front: they are dereferenced below.
+  CHECK(input.sequenceStartPositions);
+  if (type_) {
+    CHECK(input.subSequenceStartPositions)
+      << "when trans_type = seq, input must hasSubseq";
+  }
+  newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences();
+  size_t dim = getSize();
+  CHECK_EQ(dim, input.value->getWidth());
+  startPositions_ =
+      type_ ? input.subSequenceStartPositions : input.sequenceStartPositions;
+  auto starts = startPositions_->getVector(false);
+  // Check the size first so that indexing with newBatchSize_ is in range.
+  CHECK_EQ(newBatchSize_, starts->getSize() - 1);
+  CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize());
+  resetOutput(newBatchSize_, dim);
+  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
+   * thus, in this case, output_ has no sequenceStartPositions.
+   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
+   * case, we should compute the new sequenceStartPositions.
+  */
+  if (type_) {
+    output_.degradeSequence(input, useGpu_);
+  }
+}
+
+void SequencePoolLayer::backward(const UpdateCallback& callback) {
+  // Shared backward step: the activation derivative comes first.
+  backwardActivation();
+  if (!biases_ || !biases_->getWGrad()) {
+    return;
+  }
+  // Collect the bias gradient, then increase the number of gradient updates.
+  biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+  biases_->getParameterPtr()->incUpdate(callback);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h
new file mode 100644
index 0000000000..669af80e1d
--- /dev/null
+++ b/paddle/gserver/layers/SequencePoolLayer.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+/**
+ * A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer.
+ *
+ * Input: one or more sequences. Each sequence contains some instances.
+ * If SequenceLevel = kNonSeq:
+ *    Output: output size is the number of input sequences (NOT input instances)
+ *    output[i] = seqlastins/average/max_{for each instance in this
+ *                sequence}{input[i]}
+ * If SequenceLevel = kSeq:
+ *    The input sequence must have sub-sequences (checked in forward()).
+ *    Output: output size is the number of input sub-sequences
+ *    output[i] = seqlastins/average/max_{for each instance in this
+ *                sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer (max/average) or last_seq/first_seq.
+ */
+
+class SequencePoolLayer : public Layer {
+protected:
+  enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
+  int type_ = kNonSeq;  // SequenceLevel selected from trans_type in init()
+  std::unique_ptr<Weight> biases_;  // optional bias, created in init()
+  size_t newBatchSize_ = 0;  // number of output rows, set by forward()
+  ICpuGpuVectorPtr startPositions_;  // pooling boundaries, set by forward()
+
+public:
+  explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
+
+  virtual ~SequencePoolLayer() {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void backward(const UpdateCallback& callback = nullptr);
+};
+
+}  // namespace paddle
-- 
GitLab