diff --git a/doc/source/gserver/layers/layer.rst b/doc/source/gserver/layers/layer.rst index 807b22ca140ee71208a96e2877b9c5636620b165..4b8e149505f0695ad2fa4be967a50d1a0ac48b43 100644 --- a/doc/source/gserver/layers/layer.rst +++ b/doc/source/gserver/layers/layer.rst @@ -465,6 +465,11 @@ SumOfSquaresCostLayer .. doxygenclass:: paddle::SumOfSquaresCostLayer :members: +SumCostLayer +````````````````````` +.. doxygenclass:: paddle::SumCostLayer + :members: + CosSimLayer ----------- .. doxygenclass:: paddle::CosSimLayer diff --git a/doc/ui/api/trainer_config_helpers/layers.rst b/doc/ui/api/trainer_config_helpers/layers.rst index c78682423e448a472ca46f2cb100a40efface6eb..56b23640205ec8c7575541bd270720fb861457a1 100644 --- a/doc/ui/api/trainer_config_helpers/layers.rst +++ b/doc/ui/api/trainer_config_helpers/layers.rst @@ -46,6 +46,12 @@ conv_operator :members: conv_operator :noindex: +conv_projection +------------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: conv_projection + :noindex: + conv_shift_layer ------------------ .. automodule:: paddle.trainer_config_helpers.layers @@ -71,6 +77,12 @@ img_pool_layer -------------- .. automodule:: paddle.trainer_config_helpers.layers :members: img_pool_layer + :noindex: + +spp_layer +-------------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: spp_layer :noindex: maxout_layer @@ -254,6 +266,12 @@ expand_layer :members: expand_layer :noindex: +repeat_layer +------------ +.. automodule:: paddle.trainer_config_helpers.layers + :members: repeat_layer + :noindex: + Math Layers =========== @@ -401,6 +419,12 @@ hsigmoid :members: hsigmoid :noindex: +sum_cost +--------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: sum_cost + :noindex: + Check Layer ============ diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index ac35727ac28c7ee7a41d7d3d93d7c18288950a41..70b5be6fda2509853029a68d31129df28d580942 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -91,6 +91,7 @@ extern void hl_expand_feature2col( * @param[in] paddingH padding height. * @param[in] paddingW padding width. * @param[out] tgtData output data. + * @param[in] tgtStride stride between output data samples. * */ extern void hl_maxpool_forward( @@ -100,7 +101,8 @@ extern void hl_maxpool_forward( const int pooledH, const int pooledW, const int sizeX, const int sizeY, const int strideH, const int strideW, - const int paddingH, const int paddingW, real* tgtData); + const int paddingH, const int paddingW, + real* tgtData, const int tgtStride); /** * @brief Maximum pool backward. @@ -123,6 +125,7 @@ extern void hl_maxpool_forward( * @param[in] paddingH padding height. * @param[in] paddingW padding width. * @param[out] targetGrad output grad. + * @param[in] outStride stride between output data samples. * */ extern void hl_maxpool_backward( @@ -135,7 +138,7 @@ extern void hl_maxpool_backward( const int strideH, const int strideW, const int paddingH, const int paddingW, real scaleA, real scaleB, - real* targetGrad); + real* targetGrad, const int outStride); /** * @brief Averge pool forward. @@ -154,6 +157,7 @@ extern void hl_maxpool_backward( * @param[in] paddingH padding height. * @param[in] paddingW padding width. * @param[out] tgtData output data. + * @param[in] tgtStride stride between output data samples. 
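The new `tgtStride` / `outStride` arguments let the pooling kernels read and write matrices whose row stride is larger than the per-sample width, for example a column slice of a wider output matrix (the spatial pyramid pooling layer added later in this change relies on exactly that). Below is a minimal standalone C++ sketch of the index mapping the CUDA kernels use; `frameNum`, `tgtStride` and the modulo expression mirror the kernel code, everything else is illustrative.

```cpp
#include <cstddef>

// Illustrative sketch (not the CUDA kernel): map a flat per-frame output
// index to a location in a strided target matrix, mirroring
//   tgtIndex = index % (pooledW * pooledH * channels) + frameNum * tgtStride
// from KeMaxPoolForward / KeAvgPoolForward.
inline size_t stridedTargetIndex(size_t index, size_t pooledH, size_t pooledW,
                                 size_t channels, size_t tgtStride) {
  size_t perFrame = pooledH * pooledW * channels;  // elements per sample
  size_t frameNum = index / perFrame;              // which sample (row)
  size_t offsetInFrame = index % perFrame;         // position inside the row
  return frameNum * tgtStride + offsetInFrame;     // row stride may exceed perFrame
}
```

When `tgtStride` equals `pooledH * pooledW * channels`, this reduces to writing `tgtData[index]` directly, i.e. the previous behaviour.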
* */ extern void hl_avgpool_forward( @@ -163,7 +167,8 @@ extern void hl_avgpool_forward( const int pooledH, const int pooledW, const int sizeX, const int sizeY, const int strideH, const int strideW, - const int paddingH, const int paddingW, real* tgtData); + const int paddingH, const int paddingW, + real* tgtData, const int tgtStride); /** * @brief Maximum pool backward. @@ -184,6 +189,7 @@ extern void hl_avgpool_forward( * @param[in] scaleA scale. * @param[in] scaleB scale. * @param[out] backGrad output grad. + * @param[in] outStride stride between output data samples. * */ extern void hl_avgpool_backward( @@ -195,7 +201,7 @@ extern void hl_avgpool_backward( const int strideH, const int strideW, int paddingH, int paddingW, real scaleA, real scaleB, - real* backGrad); + real* backGrad, const int outStride); /** * @brief Cross-map-respose normalize forward. diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index 50fddce584252b459b1146b8528e2918416aff95..c6f32ad337705ff938b7b370a4785dc7f4393041 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -44,7 +44,8 @@ inline void hl_maxpool_forward( const int pooledH, const int pooledW, const int sizeX, const int sizeY, const int strideH, const int strideW, - const int paddingH, const int paddingW, real* tgtData) {} + const int paddingH, const int paddingW, + real* tgtData, const int tgtStride) {} inline void hl_maxpool_backward( const int frameCnt, const real* inputData, @@ -56,7 +57,7 @@ inline void hl_maxpool_backward( const int strideH, const int strideW, const int paddingH, const int paddingW, real scaleA, real scaleB, - real* targetGrad) {} + real* targetGrad, const int outStride) {} inline void hl_avgpool_forward( const int frameCnt, const real* inputData, @@ -65,7 +66,8 @@ inline void hl_avgpool_forward( const int pooledH, const int pooledW, const int sizeX, const int sizeY, const int strideH, const int strideW, - const int paddingH, const int paddingW, real* tgtData) {} + const int paddingH, const int paddingW, + real* tgtData, const int tgtStride) {} inline void hl_avgpool_backward( const int frameCnt, const real* outGrad, @@ -76,7 +78,7 @@ inline void hl_avgpool_backward( const int strideH, const int strideW, int paddingH, int paddingW, real scaleA, real scaleB, - real* backGrad) {} + real* backGrad, const int outStride) {} inline void hl_CMRNorm_forward( size_t frameCnt, const real* in, real* scale, real* out, diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index 9eec44f77f27a0bf29d3b68260d663a5687d1b0c..ae387a8bc0e0791995810df9e5f2556264d869b1 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -152,7 +152,7 @@ __global__ void KeMaxPoolForward(const int nthreads, const real* inputData, const int ksizeW, const int ksizeH, const int strideH, const int strideW, const int offsetH, const int offsetW, - real* tgtData) { + real* tgtData, const int tgtStride) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; @@ -173,7 +173,9 @@ __global__ void KeMaxPoolForward(const int nthreads, const real* inputData, maxval = inputData[h * width + w]; } } - tgtData[index] = maxval; + int tgtIndex = index % (pooledW * pooledH * channels) + + frameNum * tgtStride; + tgtData[tgtIndex] = maxval; } } @@ -184,7 +186,7 @@ void hl_maxpool_forward(const int frameCnt, const real* inputData, const int sizeX, const int sizeY, const int strideH, const int strideW, const 
int paddingH, const int paddingW, - real* tgtData) { + real* tgtData, const int tgtStride) { int num_kernels = pooledH * pooledW * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; @@ -194,7 +196,7 @@ void hl_maxpool_forward(const int frameCnt, const real* inputData, KeMaxPoolForward<<< grid, threads, 0, STREAM_DEFAULT >>> (num_kernels, inputData, channels, height, width, pooledH, pooledW, sizeX, sizeY, strideH, strideW, - paddingH, paddingW, tgtData); + paddingH, paddingW, tgtData, tgtStride); CHECK_SYNC("hl_maxpool_forward failed"); } @@ -207,7 +209,7 @@ __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData, const int strideH, const int strideW, const int padH, const int padW, real scaleA, real scaleB, - real* targetGrad) { + real* targetGrad, const int outStride) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { // find out the local index @@ -223,8 +225,8 @@ __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData, int pwend = offsetW >= 0 ? min(offsetW / strideW + 1, pooledW) : 0; real gradient = 0; real input = inputData[index]; - outData += (frameNum * channels + offsetC) * pooledH * pooledW; - outGrad += (frameNum * channels + offsetC) * pooledH * pooledW; + outData += (frameNum * outStride + offsetC * pooledH * pooledW); + outGrad += (frameNum * outStride + offsetC * pooledH * pooledW); for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { if (input == outData[ph * pooledW + pw]) { @@ -246,7 +248,7 @@ void hl_maxpool_backward(const int frameCnt, const real* inputData, const int strideH, const int strideW, const int paddingH, const int paddingW, real scaleA, real scaleB, - real* targetGrad) { + real* targetGrad, const int outStride) { int num_kernels = height * width * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; @@ -257,7 +259,7 @@ void hl_maxpool_backward(const int frameCnt, const real* inputData, strideH, strideW, paddingH, paddingW, scaleA, scaleB, - targetGrad); + targetGrad, outStride); CHECK_SYNC("hl_maxpool_backward"); } @@ -268,7 +270,7 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData, const int sizeX, const int sizeY, const int strideH, const int strideW, const int padH, const int padW, - real* tgtData) { + real* tgtData, const int tgtStride) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; @@ -293,7 +295,9 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData, aveval += inputData[h * width + w]; } } - tgtData[index] = aveval / pool_size; + int tgtIndex = index % (pooledW * pooledH * channels) + + frameNum * tgtStride; + tgtData[tgtIndex] = aveval / pool_size; } } @@ -303,14 +307,15 @@ void hl_avgpool_forward(const int frameCnt, const real* inputData, const int pooledH, const int pooledW, const int sizeX, const int sizeY, const int strideH, const int strideW, - const int paddingH, const int paddingW, real* tgtData) { + const int paddingH, const int paddingW, + real* tgtData, const int tgtStride) { int num_kernels = pooledH * pooledW * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; KeAvgPoolForward<<< blocks, 1024, 0, STREAM_DEFAULT >>> (num_kernels, inputData, channels, height, width, pooledH, pooledW, sizeX, sizeY, strideH, strideW, - paddingH, paddingW, tgtData); + paddingH, paddingW, tgtData, tgtStride); CHECK_SYNC("hl_avgpool_forward failed"); } @@ -322,7 +327,7 @@ __global__ void KeAvgPoolBackward(const 
int nthreads, const real* outGrad, const int strideH, const int strideW, const int padH, const int padW, real scaleA, real scaleB, - real* tgtGrad) { + real* tgtGrad, const int outStride) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int offsetW = index % width + padW; @@ -335,7 +340,8 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, int phend = offsetH >= 0 ? min(offsetH / strideH + 1, pooledH) : 0; int pwend = offsetW >= 0 ? min(offsetW / strideW + 1, pooledW) : 0; real gradient = 0; - outGrad += (frameNum * channels + offsetC) * pooledH * pooledW; + outGrad += (frameNum * outStride + offsetC * pooledH * pooledW); + for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { @@ -360,7 +366,7 @@ void hl_avgpool_backward(const int frameCnt, const real* outGrad, const int strideH, const int strideW, const int paddingH, const int paddingW, real scaleA, real scaleB, - real* backGrad) { + real* backGrad, const int outStride) { int num_kernels = height * width * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; @@ -370,7 +376,7 @@ void hl_avgpool_backward(const int frameCnt, const real* outGrad, strideH, strideW, paddingH, paddingW, scaleA, scaleB, - backGrad); + backGrad, outStride); CHECK_SYNC("hl_avgpool_backward failed"); } diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp index 14ff8510f7b19dc24b7b1ba603485488ddd4979d..949788be497874a5bb34e49e11bdc8ba3205ba61 100644 --- a/paddle/gserver/layers/CostLayer.cpp +++ b/paddle/gserver/layers/CostLayer.cpp @@ -562,4 +562,39 @@ void HuberTwoClass::backwardImpIn( } } +/** + * This cost layer compute the sum of its input as loss. + * \f[ + * o(i) = \sum_{j=1}^D y_{ij} + * \f] + */ +class SumCostLayer : public Layer { +public: + explicit SumCostLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) { + bool ret = Layer::init(layerMap, parameterMap); + if (!ret) return ret; + CHECK_EQ(inputLayers_.size(), 1UL); + return true; + } + + virtual void forward(PassType passType) { + Layer::forward(passType); + const MatrixPtr& input = getInputValue(0); + + /* malloc memory for the output_ if necessary */ + int batchSize = input->getHeight(); + int size = 1; + resizeOutput(batchSize, size); + output_.value->sumRows(*input); + } + + virtual void backward(const UpdateCallback& callback = nullptr) { + getInputGrad(0)->add((real)1); + } +}; + +REGISTER_LAYER(sum_cost, SumCostLayer); + } // namespace paddle diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h index b464e16737ae561dce6e7d4f16a4dd61f73204e0..f263c688213ae6a83d5db4a1025aa252344dfab8 100644 --- a/paddle/gserver/layers/CostLayer.h +++ b/paddle/gserver/layers/CostLayer.h @@ -129,7 +129,7 @@ protected: * This cost layer compute Euclidean (L2) loss for real-valued regression * tasks. 
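For reference, the new `SumCostLayer` above treats the sum of each input row as the cost, so the gradient of the cost with respect to every input element is 1; that is why its `backward()` only calls `getInputGrad(0)->add((real)1)`. An illustrative standalone sketch of the same forward/backward arithmetic (not the Paddle classes themselves):

```cpp
#include <vector>

// Sketch of the sum cost: o(i) = sum_j y_ij, hence d o(i) / d y_ij = 1 for
// every element, so the backward pass just adds 1 to the input gradient.
void sumCostForwardBackward(const std::vector<std::vector<float>>& input,
                            std::vector<float>& cost,
                            std::vector<std::vector<float>>& inputGrad) {
  cost.assign(input.size(), 0.0f);
  for (size_t i = 0; i < input.size(); ++i) {
    for (float v : input[i]) cost[i] += v;    // forward: row sum
    for (float& g : inputGrad[i]) g += 1.0f;  // backward: gradient is 1
  }
}
```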
* \f[ - * L = \frac{1}{2N} \sum_{i=1}^N {|| \hat{y}_i - y_i||_2^2} + * L = \sum_{i=1}^N {|| \hat{y}_i - y_i||_2^2} * \f] */ class SumOfSquaresCostLayer : public CostLayer { diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp index 7fc27ac0bd8e05246d87bac0e9692d8496f6601f..2fbc9001f11613cd987e3815f6f31caa8f9979cf 100644 --- a/paddle/gserver/layers/PoolLayer.cpp +++ b/paddle/gserver/layers/PoolLayer.cpp @@ -52,10 +52,8 @@ bool PoolLayer::init(const LayerMap& layerMap, Layer* PoolLayer::create(const LayerConfig& config) { CHECK_EQ(config.inputs_size(), 1); const std::string& pool = config.inputs(0).pool_conf().pool_type(); - if (pool == "max-projection") { - return new MaxPoolProjectionLayer(config); - } else if (pool == "avg-projection") { - return new AvgPoolProjectionLayer(config); + if (pool == "max-projection" || pool == "avg-projection") { + return new PoolProjectionLayer(config); #ifndef PADDLE_ONLY_CPU } else if (CudnnPoolLayer::typeCheck(pool)) { return new CudnnPoolLayer(config); diff --git a/paddle/gserver/layers/PoolProjection.cpp b/paddle/gserver/layers/PoolProjection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9be5aba3d57d23e462c9ea3608491606f988c35f --- /dev/null +++ b/paddle/gserver/layers/PoolProjection.cpp @@ -0,0 +1,123 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "PoolProjection.h" + +namespace paddle { + +REGISTER_PROJECTION_CREATE_FUNC(pool, &PoolProjection::create); + +PoolProjection::PoolProjection(const ProjectionConfig& config, + ParameterPtr parameter, bool useGpu) + : Projection(config, parameter, useGpu) { + const PoolConfig& conf = config_.pool_conf(); + poolType_ = conf.pool_type(); + channels_ = conf.channels(); + sizeX_ = conf.size_x(); + stride_ = conf.stride(); + outputX_ = conf.output_x(); + imgSize_ = conf.img_size(); + confPadding_ = conf.padding(); + + sizeY_ = conf.has_size_y() ? conf.size_y() : conf.size_x(); + imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride(); + confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding(); + outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); +} + +size_t PoolProjection::getSize() { + imgSizeY_ = in_->getFrameHeight(); + imgSize_ = in_->getFrameWidth(); + const PoolConfig& conf = config_.pool_conf(); + if (imgSizeY_ == 0) { + imgSizeY_ = conf.has_img_size_y() ? 
conf.img_size_y() : conf.img_size(); + } + if (imgSize_ == 0) { + imgSize_ = conf.img_size(); + } + outputY_ = outputSize(imgSizeY_, sizeY_, confPaddingY_, strideY_, + /* caffeMode */ false); + outputX_ = outputSize(imgSize_, sizeX_, confPadding_, stride_, + /* caffeMode */ false); + + const_cast(out_)->setFrameHeight(outputY_); + const_cast(out_)->setFrameWidth(outputX_); + + return outputY_ * outputX_ * channels_; +} + +PoolProjection* PoolProjection::create(const ProjectionConfig& config, + ParameterPtr parameter, bool useGpu) { + const std::string& pool = config.pool_conf().pool_type(); + if (pool == "max-projection") { + return new MaxPoolProjection(config, parameter, useGpu); + } else if (pool == "avg-projection") { + return new AvgPoolProjection(config, parameter, useGpu); + } else { + LOG(FATAL) << "Unknown pool type: " << pool; + return nullptr; + } +} + +void MaxPoolProjection::forward() { + size_t width = getSize(); + CHECK_EQ(width, out_->value->getWidth()); + MatrixPtr inputV = in_->value; + MatrixPtr outV = out_->value; + outV->maxPoolForward(*inputV, imgSizeY_, imgSize_, channels_, sizeX_, sizeY_, + strideY_, stride_, outputY_, outputX_, confPaddingY_, + confPadding_); +} + +void MaxPoolProjection::backward(const UpdateCallback& callback) { + (void)callback; + MatrixPtr outGrad = out_->grad; + MatrixPtr inputV = in_->value; + MatrixPtr outV = out_->value; + MatrixPtr inputGrad = in_->grad; + + if (NULL == inputGrad) { + return; + } + inputGrad->maxPoolBackward(*inputV, imgSizeY_, imgSize_, *outGrad, *outV, + sizeX_, sizeY_, strideY_, stride_, outputY_, + outputX_, 1, 1, confPaddingY_, confPadding_); +} + +void AvgPoolProjection::forward() { + size_t width = getSize(); + CHECK_EQ(width, out_->value->getWidth()); + MatrixPtr inputV = in_->value; + MatrixPtr outV = out_->value; + outV->avgPoolForward(*inputV, imgSizeY_, imgSize_, channels_, sizeX_, sizeY_, + strideY_, stride_, outputY_, outputX_, confPaddingY_, + confPadding_); +} + +void AvgPoolProjection::backward(const UpdateCallback& callback) { + (void)callback; + + MatrixPtr outputGrad = out_->grad; + MatrixPtr inputGrad = in_->grad; + + if (NULL == inputGrad) { + return; + } + + inputGrad->avgPoolBackward(*outputGrad, imgSizeY_, imgSize_, sizeX_, sizeY_, + strideY_, stride_, outputY_, outputX_, 1, 1, + confPaddingY_, confPadding_); +} +} // namespace paddle diff --git a/paddle/gserver/layers/PoolProjection.h b/paddle/gserver/layers/PoolProjection.h new file mode 100644 index 0000000000000000000000000000000000000000..a11e25b729cb7afabdb3547326f269e54ddf42da --- /dev/null +++ b/paddle/gserver/layers/PoolProjection.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
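`PoolProjection::getSize()` above derives `outputY_` / `outputX_` through `outputSize(...)` from `paddle/math/MathUtils.h`, called with `caffeMode = false`. The sketch below shows the window-count arithmetic that call is assumed to perform (ceil-style rounding when `caffeMode` is false, floor-style otherwise); the helper name `poolOutputSize` is made up for the example.

```cpp
// Sketch of the assumed pooling output-size arithmetic: caffeMode uses floor
// division, while the non-caffeMode branch used here rounds up so border
// pixels that do not fill a whole window still produce one output.
inline int poolOutputSize(int imageSize, int windowSize, int padding,
                          int stride, bool caffeMode) {
  if (caffeMode) {
    return (imageSize - windowSize + 2 * padding) / stride + 1;
  }
  return (imageSize - windowSize + 2 * padding + stride - 1) / stride + 1;
}
```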
*/ + +#pragma once + +#include "Projection.h" +#include "paddle/math/MathUtils.h" + +namespace paddle { + +class PoolProjection : public Projection { +protected: + size_t imgSizeY_, imgSize_; + size_t outputY_, outputX_; + size_t strideY_, stride_; + size_t sizeY_, sizeX_; + int confPaddingY_, confPadding_; + size_t channels_; + std::string poolType_; + +public: + PoolProjection(const ProjectionConfig& config, ParameterPtr parameter, + bool useGpu); + + static PoolProjection* create(const ProjectionConfig& config, + ParameterPtr parameter, bool useGpu); + + const std::string& getPoolType() const { return poolType_; } + + size_t getSize(); +}; + +class MaxPoolProjection : public PoolProjection { +public: + MaxPoolProjection(const ProjectionConfig& config, ParameterPtr parameter, + bool useGpu) + : PoolProjection(config, parameter, useGpu) {} + + virtual void forward(); + virtual void backward(const UpdateCallback& callback = nullptr); +}; + +class AvgPoolProjection : public PoolProjection { +public: + AvgPoolProjection(const ProjectionConfig& config, ParameterPtr parameter, + bool useGpu) + : PoolProjection(config, parameter, useGpu) {} + + virtual void forward(); + virtual void backward(const UpdateCallback& callback = nullptr); +}; +} // namespace paddle diff --git a/paddle/gserver/layers/PoolProjectionLayer.cpp b/paddle/gserver/layers/PoolProjectionLayer.cpp index 9e8ce778501bbc1f91bfad6d3ab7eb5b1b6f4c80..cabb346d6c99178f7c8ce049d495785c0a488173 100644 --- a/paddle/gserver/layers/PoolProjectionLayer.cpp +++ b/paddle/gserver/layers/PoolProjectionLayer.cpp @@ -18,6 +18,7 @@ limitations under the License. */ namespace paddle { + size_t PoolProjectionLayer::getSize() { CHECK_EQ(inputLayers_.size(), 1UL); size_t layerSize = 0; @@ -37,74 +38,23 @@ size_t PoolProjectionLayer::getSize() { layerSize = outputH_ * outputW_ * channels_; - getOutput().setFrameHeight(outputH_); - getOutput().setFrameWidth(outputW_); return layerSize; } -void MaxPoolProjectionLayer::forward(PassType passType) { - Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - /* note: one sample correspond to one ROW */ - MatrixPtr input = getInputValue(0); - int batchSize = input->getHeight(); - int size = getSize(); - resetOutput(batchSize, size); - - MatrixPtr outV = getOutputValue(); - - outV->maxPoolForward(*input, imgSizeH_, imgSizeW_, channels_, sizeX_, sizeY_, - strideY_, stride_, outputH_, outputW_, confPaddingY_, - confPadding_); -} - -void MaxPoolProjectionLayer::backward(const UpdateCallback& callback) { - (void)callback; - - if (NULL == getInputGrad(0)) { - return; - } - - /* Do derivation */ - MatrixPtr outGrad = getOutputGrad(); - MatrixPtr inputV = getInputValue(0); - MatrixPtr outV = getOutputValue(); - MatrixPtr inputGrad = getInputGrad(0); - - inputGrad->maxPoolBackward(*inputV, imgSizeH_, imgSizeW_, *outGrad, *outV, - sizeX_, sizeY_, strideY_, stride_, outputH_, - outputW_, 1, 1, confPaddingY_, confPadding_); -} - -void AvgPoolProjectionLayer::forward(PassType passType) { +void PoolProjectionLayer::forward(PassType passType) { Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - /* note: one sample correspond to one ROW */ - MatrixPtr input = getInputValue(0); - int batchSize = input->getHeight(); + const Argument& in = getInput(0); + int batchSize = in.value->getHeight(); int size = getSize(); resetOutput(batchSize, size); - - MatrixPtr outV = getOutputValue(); - - outV->avgPoolForward(*input, imgSizeH_, imgSizeW_, channels_, sizeX_, sizeY_, - strideY_, 
stride_, outputH_, outputW_, confPaddingY_, - confPadding_); + poolProjection_->forward(&in, &output_, passType); } -void AvgPoolProjectionLayer::backward(const UpdateCallback& callback) { +void PoolProjectionLayer::backward(const UpdateCallback& callback) { (void)callback; - if (NULL == getInputGrad(0)) { return; } - /* Do derivation */ - MatrixPtr outputGrad = getOutputGrad(); - MatrixPtr inputGrad = getInputGrad(0); - inputGrad->avgPoolBackward(*outputGrad, imgSizeH_, imgSizeW_, sizeX_, sizeY_, - strideY_, stride_, outputH_, outputW_, 1, 1, - confPaddingY_, confPadding_); + poolProjection_->backward(callback); } } // namespace paddle diff --git a/paddle/gserver/layers/PoolProjectionLayer.h b/paddle/gserver/layers/PoolProjectionLayer.h index 42bbc83c62246dfc8e69aa0b427b27819a701eb6..777b6f39e7cc4ebaa7078ce3378b2688363245e8 100644 --- a/paddle/gserver/layers/PoolProjectionLayer.h +++ b/paddle/gserver/layers/PoolProjectionLayer.h @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once +#include #include "PoolLayer.h" +#include "PoolProjection.h" #include "paddle/math/Matrix.h" -#include namespace paddle { /** @@ -27,33 +27,18 @@ class PoolProjectionLayer : public PoolLayer { protected: size_t imgSizeH_, imgSizeW_; size_t outputH_, outputW_; + std::unique_ptr poolProjection_; + ProjectionConfig projectionConfig_; public: - size_t getSize(); - explicit PoolProjectionLayer(const LayerConfig& config) : PoolLayer(config) {} -}; -/** - * @brief A layer for max pooling - */ -class MaxPoolProjectionLayer : public PoolProjectionLayer { -public: - explicit MaxPoolProjectionLayer(const LayerConfig& config) - : PoolProjectionLayer(config) {} - - ~MaxPoolProjectionLayer() {} + explicit PoolProjectionLayer(const LayerConfig& config) : PoolLayer(config) { + PoolConfig* conf = projectionConfig_.mutable_pool_conf(); + *conf = config_.inputs(0).pool_conf(); + poolProjection_.reset( + PoolProjection::create(projectionConfig_, nullptr, useGpu_)); + } - virtual void forward(PassType passType); - virtual void backward(const UpdateCallback& callback = nullptr); -}; -/** - * @brief A layer for average pooling - */ -class AvgPoolProjectionLayer : public PoolProjectionLayer { -public: - explicit AvgPoolProjectionLayer(const LayerConfig& config) - : PoolProjectionLayer(config) {} - - ~AvgPoolProjectionLayer() {} + size_t getSize(); virtual void forward(PassType passType); virtual void backward(const UpdateCallback& callback = nullptr); diff --git a/paddle/gserver/layers/Projection.h b/paddle/gserver/layers/Projection.h index 3fa3a0cc230ac4c8616abe0eb2c8ac41bde52d53..203edc5396a53cf72dcad6308335ba4731ba49bc 100644 --- a/paddle/gserver/layers/Projection.h +++ b/paddle/gserver/layers/Projection.h @@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
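The layer-level refactor above removes the duplicated max/avg code paths: `PoolProjectionLayer` now builds one `PoolProjection` from its `pool_conf` in the constructor and forwards both passes to it. A minimal, illustrative sketch of that ownership/delegation shape (generic names, not the Paddle classes):

```cpp
#include <memory>

// Illustrative shape of the refactor: the layer owns one projection object
// chosen from the pool type in its config and simply forwards both passes.
struct ProjectionSketch {
  virtual ~ProjectionSketch() = default;
  virtual void forward() = 0;
  virtual void backward() = 0;
};

struct PoolLayerSketch {
  std::unique_ptr<ProjectionSketch> proj;  // created once from the config
  void forward() { proj->forward(); }      // no pooling code left in the layer
  void backward() { proj->backward(); }
};
```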
*/ - #pragma once -#include "paddle/parameter/Parameter.h" -#include "ModelConfig.pb.h" #include "Layer.h" +#include "ModelConfig.pb.h" +#include "paddle/parameter/Parameter.h" namespace paddle { @@ -28,6 +27,11 @@ namespace paddle { Projection::registrar_.registerClass<__class_name>(#__type_name); \ }) +#define REGISTER_PROJECTION_CREATE_FUNC(__type_name, createFunction) \ + static InitFunction __reg_type_##__type_name([]() { \ + Projection::registrar_.registerClass(#__type_name, createFunction); \ + }) + /** * A projection takes one Argument as input, calculate the result and add it * to output Argument. @@ -50,7 +54,8 @@ public: registrar_; /** - * Forward propagation. If backward() will be called, in and out must be kept valid until then. + * Forward propagation. If backward() will be called, in and out must be kept + * valid until then. * @param in input of projection * @param out output of projection * @param passType PASS_TRAIN of PASS_TEST diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp b/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..582abf78c84a4e1bce87f78f2abbd01620bd1d9c --- /dev/null +++ b/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "SpatialPyramidPoolLayer.h" + +namespace paddle { + +REGISTER_LAYER(spp, SpatialPyramidPoolLayer); + +ProjectionConfig SpatialPyramidPoolLayer::getConfig(size_t imgSizeW, + size_t imgSizeH, + size_t channels, + size_t pyramidLevel, + std::string& poolType) { + ProjectionConfig config; + config.set_type("pool"); + PoolConfig* conf = config.mutable_pool_conf(); + conf->set_channels(channels); + conf->set_img_size(imgSizeW); + conf->set_img_size_y(imgSizeH); + conf->set_pool_type(poolType); + + int numBins = std::pow(2, pyramidLevel); + + int sizeH = std::ceil(imgSizeH / static_cast(numBins)); + int paddingH = (sizeH * numBins - imgSizeH + 1) / 2; + int outSizeH = outputSize(imgSizeH, sizeH, paddingH, sizeH, true); + + int sizeW = std::ceil(imgSizeW / static_cast(numBins)); + int paddingW = (sizeW * numBins - imgSizeW + 1) / 2; + int outSizeW = outputSize(imgSizeW, sizeW, paddingW, sizeW, true); + + conf->set_stride(sizeW); + conf->set_stride_y(sizeH); + conf->set_size_x(sizeW); + conf->set_size_y(sizeH); + conf->set_padding(paddingW); + conf->set_padding_y(paddingH); + conf->set_output_x(outSizeW); + conf->set_output_y(outSizeH); + config.set_output_size(outSizeH * outSizeW * channels); + return config; +} + +size_t SpatialPyramidPoolLayer::getSize() { + CHECK_EQ(inputLayers_.size(), 1UL); + size_t layerSize = 0; + const SppConfig& sppConf = config_.inputs(0).spp_conf(); + imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); + imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); + if (imgSizeH_ == 0) { + imgSizeH_ = sppConf.has_img_size_y() ? 
sppConf.img_size_y() : imgSizeW_; + } + if (imgSizeW_ == 0) { + imgSizeW_ = sppConf.img_size(); + } + + size_t outputH = 1; + size_t outputW = (std::pow(4, pyramidHeight_) - 1) / (4 - 1); + + layerSize = outputH * outputW * channels_; + return layerSize; +} + +bool SpatialPyramidPoolLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + CHECK_EQ(config_.inputs_size(), 1); + + const SppConfig& sppConf = config_.inputs(0).spp_conf(); + pyramidHeight_ = sppConf.pyramid_height(); + poolType_ = sppConf.pool_type(); + + channels_ = sppConf.channels(); + imgSizeW_ = sppConf.img_size(); + imgSizeH_ = sppConf.has_img_size_y() ? sppConf.img_size_y() : imgSizeW_; + poolProjections_.reserve(pyramidHeight_); + projCol_.reserve(pyramidHeight_); + projOutput_.resize(pyramidHeight_); + + size_t startCol = 0; + size_t endCol = 0; + for (size_t i = 0; i < pyramidHeight_; i++) { + poolProjections_.emplace_back(PoolProjection::create( + getConfig(imgSizeW_, imgSizeH_, channels_, i, poolType_), nullptr, + useGpu_)); + endCol += poolProjections_[i]->getOutputSize(); + projCol_.push_back(std::make_pair(startCol, endCol)); + startCol = endCol; + } + CHECK_EQ(endCol, getSize()); + return true; +} + +void SpatialPyramidPoolLayer::forward(PassType passType) { + Layer::forward(passType); + + int batchSize = getInput(0).getBatchSize(); + resetOutput(batchSize, getSize()); + for (size_t i = 0; i < pyramidHeight_; i++) { + size_t startCol = projCol_[i].first; + size_t endCol = projCol_[i].second; + projOutput_[i].value = output_.value->subColMatrix(startCol, endCol); + projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol); + } + for (size_t i = 0; i < pyramidHeight_; i++) { + poolProjections_[i]->forward(&getInput(0), &projOutput_[i], passType); + } +} + +void SpatialPyramidPoolLayer::backward(const UpdateCallback& callback) { + for (size_t i = 0; i < pyramidHeight_; i++) { + if (poolProjections_[i]) { + poolProjections_[i]->backward(callback); + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/gserver/layers/SpatialPyramidPoolLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..e15b6d2f85c6f5b9620e28aaef9c6246341611f9 --- /dev/null +++ b/paddle/gserver/layers/SpatialPyramidPoolLayer.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "PoolProjection.h" +#include "paddle/math/MathUtils.h" +#include "paddle/utils/Logging.h" + +namespace paddle { +/** + * @brief A layer for spatial pyramid pooling on the input image by taking + * the max, average, etc. within regions, so that the result vector of + * different sized images are of the same size. + * + * The config file api is spp_layer. 
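The pyramid arithmetic in `getConfig()` / `getSize()` above works as follows: level `l` divides the image into `2^l x 2^l` bins regardless of image size, so level `l` contributes `4^l` outputs per channel and the whole pyramid contributes `(4^H - 1) / 3` per channel. An illustrative standalone sketch of that bookkeeping (struct and function names are invented for the example):

```cpp
// Level l splits the image into 2^l x 2^l bins, so it contributes 4^l bins.
struct SppLevel { int numBins, sizeH, sizeW, padH, padW; };

SppLevel sppLevel(int imgSizeH, int imgSizeW, int level) {
  int numBins = 1 << level;                         // 2^level bins per dimension
  int sizeH = (imgSizeH + numBins - 1) / numBins;   // ceil(imgSizeH / numBins)
  int sizeW = (imgSizeW + numBins - 1) / numBins;
  int padH = (sizeH * numBins - imgSizeH + 1) / 2;  // symmetric padding
  int padW = (sizeW * numBins - imgSizeW + 1) / 2;
  return {numBins, sizeH, sizeW, padH, padW};
}

int sppOutputSize(int channels, int pyramidHeight) {
  int bins = 0;
  for (int l = 0, perLevel = 1; l < pyramidHeight; ++l, perLevel *= 4) {
    bins += perLevel;                               // sum of 4^l = (4^H - 1) / 3
  }
  return bins * channels;
}
```

With `channels = 16` and `pyramid_height = 3` this gives 16 * 21 = 336 output columns, matching the `(std::pow(4, pyramidHeight_) - 1) / (4 - 1)` expression in `getSize()` and the size computed in the unit test below.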
+ */ + +class SpatialPyramidPoolLayer : public Layer { +protected: + size_t channels_; + size_t imgSizeW_; + size_t imgSizeH_; + size_t pyramidHeight_; + std::string poolType_; + + std::vector> poolProjections_; + std::vector projOutput_; + std::vector> projCol_; + +public: + explicit SpatialPyramidPoolLayer(const LayerConfig& config) : Layer(config) {} + + ~SpatialPyramidPoolLayer() {} + + virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + ProjectionConfig getConfig(size_t sizeX_, size_t sizeY_, size_t channels, + size_t pyamidLevel_, std::string& poolType_); + size_t getSize(); + + virtual void forward(PassType passType); + virtual void backward(const UpdateCallback& callback = nullptr); +}; +} // namespace paddle diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 9e9ee9eeceb928229f2e3be29c229f7a2ab14d8a..e7e07e9e69dc7a1b51211364dad7043bdcbaf4c3 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -13,15 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include #include -#include "paddle/gserver/layers/DataLayer.h" +#include #include "ModelConfig.pb.h" +#include "paddle/gserver/layers/DataLayer.h" #include "paddle/trainer/Trainer.h" #include "paddle/math/MathUtils.h" -#include "TestUtil.h" #include "LayerGradUtil.h" +#include "TestUtil.h" using namespace paddle; // NOLINT using namespace std; // NOLINT @@ -981,6 +981,32 @@ TEST(Layer, PoolLayer) { #endif } +void testSppLayer(const string& poolType, const int pyramidHeight, bool trans, + bool useGpu) { + TestConfig config; + config.layerConfig.set_type("spp"); + config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + SppConfig* sppConfig = input->mutable_spp_conf(); + sppConfig->set_pool_type(poolType); + sppConfig->set_pyramid_height(pyramidHeight); + sppConfig->set_channels(16); + sppConfig->set_img_size(10); + sppConfig->set_img_size_y(20); + int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1); + config.layerConfig.set_size(outputSize * sppConfig->channels()); + testLayerGrad(config, "spp", 100, trans, useGpu); +} + +TEST(Layer, SpatialPyramidPoolLayer) { + for (auto useGpu : {false, true}) { + for (auto pyramidHeight : {1, 2, 3}) { + testSppLayer("avg-projection", pyramidHeight, false, useGpu); + testSppLayer("max-projection", pyramidHeight, false, useGpu); + } + } +} + TEST(Layer, rankCostLayer) { TestConfig config; config.layerConfig.set_type("rank-cost"); @@ -998,6 +1024,19 @@ TEST(Layer, rankCostLayer) { } } +TEST(Layer, sumCostLayer) { + TestConfig config; + config.layerConfig.set_type("sum_cost"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "sum_cost", 100, false, useGpu); + } +} + TEST(Layer, weightedRankCostLayer) { TestConfig config; config.layerConfig.set_type("rank-cost"); diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 8a2a3791d7feda9affdec2a8155230fdf4aef99b..950c3bb6cca28ad4e9c10bc984898c9d643478c4 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -13,20 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. 
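The SPP layer's `forward()` above also explains why the pooling code now carries an explicit output stride: each pyramid level writes into a `subColMatrix` slice `[startCol, endCol)` of the shared output, so consecutive rows of a slice are a full output-width apart even though the slice itself is narrower. A small illustrative sketch of such a strided column view (not Paddle's `Matrix` API):

```cpp
#include <cstddef>
#include <vector>

// A column slice of a row-major matrix: its row stride is the width of the
// full matrix, which is why pooling needs a stride separate from the slice width.
struct ColumnSlice {
  float* data;    // first element of the slice
  size_t width;   // endCol - startCol
  size_t stride;  // row stride = width of the full output matrix
  float& at(size_t row, size_t col) { return data[row * stride + col]; }
};

ColumnSlice subColMatrixView(std::vector<float>& full, size_t totalWidth,
                             size_t startCol, size_t endCol) {
  return {full.data() + startCol, endCol - startCol, totalWidth};
}
```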
*/ #include "Matrix.h" +#include "MathFunctions.h" #include "SparseMatrix.h" #include "SparseRowMatrix.h" -#include "MathFunctions.h" -#include #include #include +#include -#include "paddle/utils/Logging.h" #include #include "hl_cnn.h" #include "hl_gpu.h" #include "hl_table_apply.h" #include "hl_top_k.h" +#include "paddle/utils/Logging.h" #include "paddle/utils/ThreadLocal.h" @@ -43,9 +43,9 @@ inline real _safelog(real a) { return a > 0.0f ? std::log(a) : -40.0f; } Matrix::Matrix(MemoryHandlePtr memHandle, size_t height, size_t width, bool trans, bool use_gpu) : BaseMatrix( - height, width, - memHandle ? (reinterpret_cast(memHandle->getBuf())) : nullptr, - trans, use_gpu) { + height, width, + memHandle ? (reinterpret_cast(memHandle->getBuf())) : nullptr, + trans, use_gpu) { elementCnt_ = width * height; memoryHandle_ = memHandle; } @@ -96,7 +96,7 @@ MatrixPtr Matrix::create(MemoryHandlePtr memHandle, size_t height, size_t width, if (auto gpuHandle = std::dynamic_pointer_cast(memHandle)) { return std::make_shared(gpuHandle, height, width, trans); } else if (auto cpuHandle = - std::dynamic_pointer_cast(memHandle)) { + std::dynamic_pointer_cast(memHandle)) { return std::make_shared(cpuHandle, height, width, trans); } else { LOG(FATAL) << "Wrong"; @@ -387,17 +387,17 @@ void GpuMatrix::addSharedBias(Matrix& b, real scale) { void GpuMatrix::collectBias(Matrix& a, real scale) { CHECK_EQ(getHeight(), (size_t)1); CHECK_EQ(width_, a.getWidth()); - GpuSparseMatrix* sMatPtr = dynamic_cast(&a); + GpuSparseMatrix* sMatPtr = dynamic_cast(&a); if (!sMatPtr) { sumCols(a, scale); } else { real* data = getData(); hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get(); - hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), - width_, scale); + hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale); } } + void GpuMatrix::collectSharedBias(Matrix& a, real scale) { CHECK_EQ(getHeight(), (size_t)1); CHECK_EQ(a.getWidth() % getWidth(), 0UL); @@ -453,8 +453,8 @@ void GpuMatrix::mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, hl_trans_op_t transa = !a.isTransposed() ? HPPL_OP_N : HPPL_OP_T; hl_trans_op_t transb = !b.isTransposed() ? 
HPPL_OP_N : HPPL_OP_T; - hl_matrix_mul(A_d, transa, B_d, transb, C_d, dimM, dimN, dimK, - scaleAB, scaleT, lda, ldb, ldc); + hl_matrix_mul(A_d, transa, B_d, transb, C_d, dimM, dimN, dimK, scaleAB, + scaleT, lda, ldb, ldc); } void GpuMatrix::mul(const GpuSparseMatrix& a, const GpuMatrix& b, real scaleAB, @@ -475,8 +475,8 @@ void GpuMatrix::mul(const GpuSparseMatrix& a, const GpuMatrix& b, real scaleAB, hl_sparse_matrix_s A_d = a.sMatrix_.get(); real* B_d = b.data_; real* C_d = data_; - hl_matrix_csr_mul_dense(A_d, transA, B_d, HPPL_OP_N, C_d, height_, - width_, b.height_, scaleAB, scaleT); + hl_matrix_csr_mul_dense(A_d, transA, B_d, HPPL_OP_N, C_d, height_, width_, + b.height_, scaleAB, scaleT); } void GpuMatrix::mul(const GpuMatrix& a, const GpuSparseMatrix& b, real scaleAB, @@ -497,11 +497,11 @@ void GpuMatrix::mul(const GpuMatrix& a, const GpuSparseMatrix& b, real scaleAB, << "Matrix dimensions are not equal"; } if (b.format_ == SPARSE_CSC) { - hl_matrix_dense_mul_csc(A_d, HPPL_OP_N, B_d, transB, C_d, height_, - width_, a.width_, scaleAB, scaleT); + hl_matrix_dense_mul_csc(A_d, HPPL_OP_N, B_d, transB, C_d, height_, width_, + a.width_, scaleAB, scaleT); } else { - hl_matrix_dense_mul_csr(A_d, HPPL_OP_N, B_d, transB, C_d, height_, - width_, a.width_, scaleAB, scaleT); + hl_matrix_dense_mul_csr(A_d, HPPL_OP_N, B_d, transB, C_d, height_, width_, + a.width_, scaleAB, scaleT); } } @@ -563,8 +563,8 @@ void GpuMatrix::selectRows(Matrix& table, IVector& ids) { size_t tableSize = table.getHeight(); int* index = ids.getData(); - hl_matrix_select_rows(a, stride_, table.getData(), table.stride_, - index, numSamples, tableSize, dim); + hl_matrix_select_rows(a, stride_, table.getData(), table.stride_, index, + numSamples, tableSize, dim); #endif } @@ -581,8 +581,8 @@ void GpuMatrix::addToRows(Matrix& table, IVector& ids) { size_t tableSize = table.getHeight(); int* index = ids.getData(); - hl_matrix_add_to_rows(table.getData(), table.stride_, a, stride_, - index, numSamples, tableSize, dim); + hl_matrix_add_to_rows(table.getData(), table.stride_, a, stride_, index, + numSamples, tableSize, dim); #endif } @@ -617,13 +617,8 @@ void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { CHECK_EQ(maxIds.getSize(), numSamples * beam); CHECK_EQ(maxVal.getHeight(), numSamples); - hl_matrix_top_k(maxVal.getData(), - maxVal.getStride(), - maxIds.getData(), - this->getData(), - this->getStride(), - this->getWidth(), - beam, + hl_matrix_top_k(maxVal.getData(), maxVal.getStride(), maxIds.getData(), + this->getData(), this->getStride(), this->getWidth(), beam, numSamples); #endif } @@ -647,12 +642,12 @@ void GpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels, size_t size = getWidth(); size_t batchSize = getHeight(); - const real* input = a.getData(); + const real* input = a.getData(); real* output = getData(); int* idForGpu = id.getData(); - hl_maxout_forward(input, output, idForGpu, batchSize, size, - size / channels, groups); + hl_maxout_forward(input, output, idForGpu, batchSize, size, size / channels, + groups); } void GpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels, @@ -663,12 +658,12 @@ void GpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t size = a.getWidth(); size_t batchSize = getHeight(); - real* input = getData(); + real* input = getData(); const real* output = a.getData(); const int* idForGpu = id.getData(); - hl_maxout_backward(input, output, idForGpu, batchSize, size, - size / channels, groups); + hl_maxout_backward(input, output, idForGpu, 
batchSize, size, size / channels, + groups); } /*calulate the error of classification */ @@ -684,8 +679,8 @@ void GpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) { real* recResult_d = data_; int* label_d = label_ptr->getData(); - hl_matrix_classification_error(output_d, label_d, recResult_d, - height_, output_ptr->width_); + hl_matrix_classification_error(output_d, label_d, recResult_d, height_, + output_ptr->width_); } /* copy -log(output[i * width + label]) to this->data[i] */ @@ -754,8 +749,7 @@ void GpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) { real* outputData = output.getData(); auto starts = index.getData(); int numSequences = index.getSize() - 1; - hl_sequence_softmax_forward(inputData, outputData, - starts, numSequences); + hl_sequence_softmax_forward(inputData, outputData, starts, numSequences); } void GpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { @@ -769,8 +763,7 @@ void GpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { real* output_d = output.data_; real* sftmaxSum_d = sftmaxSum.data_; real* grad_d = data_; - hl_matrix_softmax_derivative(grad_d, output_d, sftmaxSum_d, height_, - width_); + hl_matrix_softmax_derivative(grad_d, output_d, sftmaxSum_d, height_, width_); } void GpuMatrix::softmaxBackward(Matrix& outputV) { @@ -821,7 +814,7 @@ void GpuMatrix::scaledTanh(Matrix& output, real p1, real p2) { } void GpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) { CHECK(output1.useGpu_ == true && output2.useGpu_ == true) - << "Matrix type are not equal"; + << "Matrix type are not equal"; size_t numSamples = getHeight(); size_t dim = output1.getWidth(); CHECK_EQ(getWidth(), 1UL); @@ -830,15 +823,15 @@ void GpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) { real* out = getData(); real* x = output1.getData(); real* y = output2.getData(); - hl_cossim(out, x, y, - dim, output1.getHeight(), output2.getHeight(), scale); + hl_cossim(out, x, y, dim, output1.getHeight(), output2.getHeight(), scale); } void GpuMatrix::cosSimDerivative(Matrix& output, Matrix& prevOut1, Matrix& prevOut2, Matrix& prevGrad1, Matrix& prevGrad2, real scale) { CHECK(output.useGpu_ == true && prevOut1.useGpu_ == true && prevOut2.useGpu_ == true && prevGrad1.useGpu_ == true && - prevGrad2.useGpu_ == true) << "Matrix type are not equal"; + prevGrad2.useGpu_ == true) + << "Matrix type are not equal"; CHECK_EQ(getWidth(), 1UL); CHECK_EQ(output.getWidth(), 1UL); @@ -858,9 +851,8 @@ void GpuMatrix::cosSimDerivative(Matrix& output, Matrix& prevOut1, real* prevOutY = prevOut2.getData(); real* prevGradX = prevGrad1.getData(); real* prevGradY = prevGrad2.getData(); - hl_cossim_derivative(grad, out, prevOutX, prevOutY, - prevGradX, prevGradY, dim, - prevOut1.getHeight(), prevOut2.getHeight(), scale); + hl_cossim_derivative(grad, out, prevOutX, prevOutY, prevGradX, prevGradY, dim, + prevOut1.getHeight(), prevOut2.getHeight(), scale); } void GpuMatrix::randomizeUniform() { @@ -911,8 +903,8 @@ void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) { void GpuMatrix::convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, int channels, int blockH, int blockW, int strideH, - int strideW, int paddingH, int paddingW, - int outputH, int outputW) { + int strideW, int paddingH, int paddingW, int outputH, + int outputW) { CHECK(feature.useGpu_ == true) << "Matrix type are not equal"; CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels), @@ -922,17 +914,16 @@ void GpuMatrix::convExpand(Matrix& feature, int 
feaImgHeight, int feaImgWidth, size_t elemCnt = outputH * outputW * blockH * blockW * channels; CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal"; - hl_expand_feature2col(feature.getData(), channels, feaImgHeight, - feaImgWidth, blockH, blockW, strideH, strideW, - paddingH, paddingW, outputH, outputW, - getData()); + hl_expand_feature2col(feature.getData(), channels, feaImgHeight, feaImgWidth, + blockH, blockW, strideH, strideW, paddingH, paddingW, + outputH, outputW, getData()); } void GpuMatrix::convShrink(Matrix& expandFeat, int thisImgHeight, int thisImgWidth, int channels, int blockH, int blockW, int strideH, int strideW, int paddingH, - int paddingW, int outputH, int outputW, - real alpha, real beta) { + int paddingW, int outputH, int outputW, real alpha, + real beta) { CHECK(expandFeat.useGpu_ == true) << "Matrix type are not equal"; CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels), getHeight() * getWidth()) @@ -941,18 +932,17 @@ void GpuMatrix::convShrink(Matrix& expandFeat, int thisImgHeight, size_t elemCnt = outputH * outputW * blockW * blockH * channels; CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth()) << "Matrix dimensions are not equal"; - hl_shrink_col2feature( - expandFeat.getData(), channels, thisImgHeight, thisImgWidth, blockH, - blockW, strideH, strideW, paddingH, paddingW, outputH, outputW, - getData(), alpha, beta); + hl_shrink_col2feature(expandFeat.getData(), channels, thisImgHeight, + thisImgWidth, blockH, blockW, strideH, strideW, + paddingH, paddingW, outputH, outputW, getData(), alpha, + beta); } void GpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, - size_t imgSizeW, size_t channels, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW) { + size_t imgSizeW, size_t channels, size_t sizeX, + size_t sizeY, size_t strideH, size_t strideW, + size_t outputH, size_t outputW, size_t paddingH, + size_t paddingW) { CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; real* inputData = inputMat.getData(); @@ -963,16 +953,15 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, CHECK(height_ == inputMat.getHeight()); CHECK(width_ == outputH * outputW * channels); - hl_maxpool_forward(frameNum, inputData, channels, height, width, - outputH, outputW, sizeX, sizeY, strideH, strideW, - paddingH, paddingW, data_); + hl_maxpool_forward(frameNum, inputData, channels, height, width, outputH, + outputW, sizeX, sizeY, strideH, strideW, paddingH, + paddingW, data_, getStride()); } void GpuMatrix::maxPoolBackward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, Matrix& outGrad, Matrix& outV, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, + size_t sizeX, size_t sizeY, size_t strideH, + size_t strideW, size_t outputH, size_t outputW, real scaleTargets, real scaleOutput, size_t paddingH, size_t paddingW) { CHECK(inputMat.useGpu_ == true && outGrad.useGpu_ == true && @@ -992,19 +981,17 @@ void GpuMatrix::maxPoolBackward(Matrix& inputMat, size_t imgSizeH, CHECK(outGrad.getHeight() == outV.getHeight() && outGrad.getWidth() == outV.getWidth()); - - hl_maxpool_backward(frameNum, inputData, outData, outDiff, channels, - height, width, outputH, outputW, sizeX, sizeY, - strideH, strideW, paddingH, paddingW, - scaleTargets, scaleOutput, data_); + hl_maxpool_backward(frameNum, inputData, outData, outDiff, channels, height, + width, outputH, outputW, sizeX, sizeY, strideH, strideW, + 
paddingH, paddingW, scaleTargets, scaleOutput, data_, + outGrad.getStride()); } void GpuMatrix::avgPoolForward(Matrix& inputMat, size_t imgSizeH, - size_t imgSizeW, size_t channels, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW) { + size_t imgSizeW, size_t channels, size_t sizeX, + size_t sizeY, size_t strideH, size_t strideW, + size_t outputH, size_t outputW, size_t paddingH, + size_t paddingW) { CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; real* inputData = inputMat.getData(); @@ -1015,18 +1002,17 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat, size_t imgSizeH, CHECK(height_ == inputMat.getHeight()); CHECK(width_ == outputH * outputW * channels); - hl_avgpool_forward(frameNum, inputData, channels, height, width, - outputH, outputW, sizeX, sizeY, - strideH, strideW, - paddingH, paddingW, data_); + hl_avgpool_forward(frameNum, inputData, channels, height, width, outputH, + outputW, sizeX, sizeY, strideH, strideW, paddingH, + paddingW, data_, getStride()); } void GpuMatrix::avgPoolBackward(Matrix& outGrad, size_t imgSizeH, size_t imgSizeW, size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW) { + size_t strideH, size_t strideW, size_t outputH, + size_t outputW, real scaleTargets, + real scaleOutput, size_t paddingH, + size_t paddingW) { CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal"; real* outDiff = outGrad.getData(); @@ -1038,11 +1024,10 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad, size_t imgSizeH, CHECK(height_ == outGrad.getHeight()); CHECK(outGrad.getWidth() == outputH * outputW * channels); - hl_avgpool_backward(frameNum, outDiff, channels, height, width, - outputH, outputW, sizeX, sizeY, - strideH, strideW, paddingH, paddingW, - scaleTargets, scaleOutput, - data_); + hl_avgpool_backward(frameNum, outDiff, channels, height, width, outputH, + outputW, sizeX, sizeY, strideH, strideW, paddingH, + paddingW, scaleTargets, scaleOutput, data_, + outGrad.getStride()); } void GpuMatrix::crossMapNormalFwd(Matrix& input, size_t imgSizeH, @@ -1057,8 +1042,8 @@ void GpuMatrix::crossMapNormalFwd(Matrix& input, size_t imgSizeH, CHECK(denoms.getHeight() == input.getHeight() && denoms.getWidth() == input.getWidth() && input.getHeight() == height_ && input.getWidth() == width_); - hl_CMRNorm_forward(num, input.getData(), denoms.getData(), data_, - channels, height, width, sizeX, scale, -pow); + hl_CMRNorm_forward(num, input.getData(), denoms.getData(), data_, channels, + height, width, sizeX, scale, -pow); } void GpuMatrix::crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, @@ -1078,13 +1063,11 @@ void GpuMatrix::crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, denoms.getWidth() == localGrad.getWidth()); hl_CMRNorm_backward(num, preOutV.getData(), denoms.getData(), - localOutV.getData(), localGrad.getData(), data_, - channels, height, width, sizeX, -pow, - 2.0f * pow * scale); + localOutV.getData(), localGrad.getData(), data_, channels, + height, width, sizeX, -pow, 2.0f * pow * scale); } -void GpuMatrix::maxSequenceForward(Matrix& input, - const IVector& sequence, +void GpuMatrix::maxSequenceForward(Matrix& input, const IVector& sequence, IVector& index) { CHECK(dynamic_cast(&input)); CHECK(dynamic_cast(&sequence)); @@ -1101,12 +1084,11 @@ void GpuMatrix::maxSequenceForward(Matrix& input, CHECK_EQ(numSequences, sequence.getSize() - 1); 
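On the matrix side, the GPU wrappers above now pass `getStride()` (or `outGrad.getStride()`) down to the CUDA calls, and the CPU pooling loops further below re-anchor their output pointer per sample when the matrix is a non-contiguous view. An illustrative sketch of that per-sample re-anchoring (standalone, not the Paddle code):

```cpp
#include <cstddef>

// Sketch: when the output matrix is a non-contiguous view (row stride larger
// than the row width), sample n's outputs start at data + n * stride rather
// than data + n * rowWidth, mirroring `outData = data_ + n * outStride`.
void writePooledFrame(float* data, size_t n, size_t rowWidth, size_t stride,
                      const float* pooled /* rowWidth values for sample n */) {
  float* outRow = data + n * stride;
  for (size_t j = 0; j < rowWidth; ++j) {
    outRow[j] = pooled[j];
  }
}
```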
CHECK_EQ(numSequences * dim, index.getSize()); - hl_max_sequence_forward(inputData, starts, outData, maxIndex, - numSequences, dim); + hl_max_sequence_forward(inputData, starts, outData, maxIndex, numSequences, + dim); } -void GpuMatrix::maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, +void GpuMatrix::maxSequenceBackward(Matrix& outputGrad, const IVector& sequence, IVector& index) { CHECK(dynamic_cast(&outputGrad)); CHECK(dynamic_cast(&sequence)); @@ -1163,9 +1145,8 @@ void GpuMatrix::contextProjectionBackwardData(MatrixPtr inputGrad, real* inGrad = inputGrad->getData(); const int* starts = sequence.getData(); - hl_context_projection_backward_data(outGrad, starts, inGrad, - numSequences, inputDim, - contextLength, contextStart); + hl_context_projection_backward_data(outGrad, starts, inGrad, numSequences, + inputDim, contextLength, contextStart); } void GpuMatrix::contextProjectionBackwardWeight(MatrixPtr weightGrad, @@ -1185,9 +1166,9 @@ void GpuMatrix::contextProjectionBackwardWeight(MatrixPtr weightGrad, real* wtGrad = weightGrad->getData(); const int* starts = sequence.getData(); - hl_context_projection_backward_weight( - outGrad, starts, wtGrad, numSequences, weightDim, totalPad, contextLength, - contextStart, beginPad); + hl_context_projection_backward_weight(outGrad, starts, wtGrad, numSequences, + weightDim, totalPad, contextLength, + contextStart, beginPad); } void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) { @@ -1199,8 +1180,7 @@ void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) { size_t numSamples = data.getHeight(); size_t partial_sum = numElements / (W.getHeight() * W.getWidth()); real* output = getData(); - hl_param_relu_forward(output, input, w, numElements, numSamples, - partial_sum); + hl_param_relu_forward(output, input, w, numElements, numSamples, partial_sum); } void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { @@ -1212,8 +1192,8 @@ void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { size_t numElements = data.getWidth(); size_t numSamples = data.getHeight(); size_t partial_sum = numElements / (this->getHeight() * this->getWidth()); - hl_param_relu_backward_w(wgrad, ograd, input, - numElements, numSamples, partial_sum); + hl_param_relu_backward_w(wgrad, ograd, input, numElements, numSamples, + partial_sum); } void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { @@ -1224,8 +1204,8 @@ void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { size_t numElements = data.getWidth(); size_t numSamples = data.getHeight(); size_t partial_sum = numElements / (W.getHeight() * W.getWidth()); - hl_param_relu_backward_diff(ograd, input, w, diff, - numElements, numSamples, partial_sum); + hl_param_relu_backward_diff(ograd, input, w, diff, numElements, numSamples, + partial_sum); } void GpuMatrix::addColumnVector(const Matrix& b) { @@ -1571,8 +1551,8 @@ void CpuMatrix::inverse(MatrixPtr matInv, bool memAlloc) { void CpuMatrix::convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, int channels, int blockH, int blockW, int strideH, - int strideW, int paddingH, int paddingW, - int outputH, int outputW) { + int strideW, int paddingH, int paddingW, int outputH, + int outputW) { CHECK(feature.useGpu_ == false) << "Matrix type are not equal"; CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels), @@ -1612,8 +1592,8 @@ void CpuMatrix::convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, void CpuMatrix::convShrink(Matrix& expandFeat, int thisImgHeight, int 
thisImgWidth, int channels, int blockH, int blockW, int strideH, int strideW, int paddingH, - int paddingW, int outputH, int outputW, - real alpha, real beta) { + int paddingW, int outputH, int outputW, real alpha, + real beta) { CHECK(expandFeat.useGpu_ == false) << "Matrix type are not equal"; CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels), getHeight() * getWidth()) @@ -1650,11 +1630,10 @@ void CpuMatrix::convShrink(Matrix& expandFeat, int thisImgHeight, } void CpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, - size_t imgSizeW, size_t channels, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW) { + size_t imgSizeW, size_t channels, size_t sizeX, + size_t sizeY, size_t strideH, size_t strideW, + size_t outputH, size_t outputW, size_t paddingH, + size_t paddingW) { real* inputData = inputMat.getData(); real* outData = data_; size_t num = inputMat.getHeight(); @@ -1662,15 +1641,21 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t inHeight = imgSizeH; CHECK(inHeight * inWidth == inputMat.getWidth() / channels); CHECK_EQ(num, this->getHeight()); - CHECK_EQ(channels*outputH*outputW, this->getWidth()); + CHECK_EQ(channels * outputH * outputW, this->getWidth()); + size_t outStride = getStride(); /* initialize the data_ */ - for (size_t i = 0; i < height_ * width_; i++) { - outData[i] = -(real)FLT_MAX; + for (size_t i = 0; i < height_; i++) { + for (size_t j = 0; j < width_; j++) { + outData[i * outStride + j] = -(real)FLT_MAX; + } } /* pool max one by one */ - for (size_t n = 0; n < num; ++n) { // frame by frame + for (size_t n = 0; n < num; ++n) { // frame by frame + if (!isContiguous()) { + outData = data_ + n * outStride; + } for (size_t c = 0; c < channels; ++c) { // channel by channel for (size_t ph = 0; ph < outputH; ++ph) { for (size_t pw = 0; pw < outputW; ++pw) { @@ -1712,7 +1697,16 @@ void CpuMatrix::maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW, real* inData = image.getData(); real* otData = outV.getData(); real* otGrad = outGrad.getData(); + + size_t outStride = outV.getStride(); + real* origOutData = otData; + real* origOutGrad = otGrad; + for (size_t n = 0; n < num; ++n) { + if (!outV.isContiguous()) { + otData = origOutData + n * outStride; + otGrad = origOutGrad + n * outStride; + } for (size_t c = 0; c < channels; ++c) { for (size_t ph = 0; ph < outputH; ++ph) { for (size_t pw = 0; pw < outputW; ++pw) { @@ -1743,9 +1737,9 @@ void CpuMatrix::maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW, void CpuMatrix::avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW, size_t channels, size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW) { + size_t strideH, size_t strideW, size_t outputH, + size_t outputW, size_t paddingH, + size_t paddingW) { // The main loop size_t num = input.getHeight(); size_t inHeight = imgSizeH; @@ -1756,6 +1750,9 @@ void CpuMatrix::avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW, real* inData = input.getData(); for (size_t n = 0; n < num; ++n) { + if (!isContiguous()) { + tgtData = data_ + n * getStride(); + } for (size_t c = 0; c < channels; ++c) { for (size_t ph = 0; ph < outputH; ++ph) { for (size_t pw = 0; pw < outputW; ++pw) { @@ -1787,9 +1784,8 @@ void CpuMatrix::avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW, } void CpuMatrix::avgPoolBackward(Matrix& input, size_t imgSizeH, 
size_t imgSizeW, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, + size_t sizeX, size_t sizeY, size_t strideH, + size_t strideW, size_t outputH, size_t outputW, real scaleTargets, real scaleOutput, size_t paddingH, size_t paddingW) { size_t num = input.getHeight(); @@ -1799,6 +1795,9 @@ void CpuMatrix::avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW, real* outData = getData(); for (size_t n = 0; n < num; ++n) { + if (!input.isContiguous()) { + inData = input.getData() + n * input.getStride(); + } for (size_t c = 0; c < channels; ++c) { for (size_t ph = 0; ph < outputH; ++ph) { for (size_t pw = 0; pw < outputW; ++pw) { @@ -1901,8 +1900,7 @@ void CpuMatrix::crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, * Output: output size is the number of input sequences (NOT input instances). * output[i] is set to max_{for each instance in this sequence}{input[i]} */ -void CpuMatrix::maxSequenceForward(Matrix& input, - const IVector& sequence, +void CpuMatrix::maxSequenceForward(Matrix& input, const IVector& sequence, IVector& index) { CHECK(dynamic_cast(&input)); CHECK(dynamic_cast(&sequence)); @@ -1943,8 +1941,7 @@ void CpuMatrix::maxSequenceForward(Matrix& input, } } -void CpuMatrix::maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, +void CpuMatrix::maxSequenceBackward(Matrix& outputGrad, const IVector& sequence, IVector& index) { CHECK(dynamic_cast(&outputGrad)); CHECK(dynamic_cast(&sequence)); @@ -2776,7 +2773,7 @@ void SharedCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, blockSeq.push_back(k); } std::shuffle(blockSeq.begin(), blockSeq.end(), - ThreadLocalRandomEngine::get()); + ThreadLocalRandomEngine::get()); } std::vector& localBufRows = *localBufRows_; int* cols = a->getCols(); @@ -3007,7 +3004,7 @@ void CpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels, size_t size = getWidth(); size_t batchSize = getHeight(); size_t featLen = size / channels; - const real* input = a.getData(); + const real* input = a.getData(); int* idForCpu = id.getData(); MatrixPtr maxInMat, maxOutMat; @@ -3041,8 +3038,8 @@ void CpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t batchSize = getHeight(); size_t featLen = size / channels; size_t newFeatLen = groups * featLen; - real* inputG = getData(); - const real* outG = a.getData(); + real* inputG = getData(); + const real* outG = a.getData(); int* idForCpu = id.getData(); for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) { @@ -3266,9 +3263,9 @@ void CpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) { CHECK(isContiguous()); MatrixPtr inTmp = Matrix::create(nullptr, /* height= */ 1, 1, - /* trans= */ false, false); + /* trans= */ false, false); MatrixPtr outTmp = Matrix::create(nullptr, /* height= */ 1, 1, - /* trans= */ false, false); + /* trans= */ false, false); size_t numSequences = index.getSize() - 1; auto starts = index.getData(); for (size_t i = 0; i < numSequences; ++i) { diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp index 2cdff9d1aca927122fcdb0c2a7ab22a0e38b41c1..b16d4314654ffeab74137ec1ee69203dab56d851 100644 --- a/paddle/utils/Util.cpp +++ b/paddle/utils/Util.cpp @@ -378,7 +378,7 @@ hl_activation_mode_t hlActiveType(const std::string& type) { return HL_ACTIVATION_RELU; } else if (type == "tanh") { return HL_ACTIVATION_TANH; - } else if (type == "linear") { + } else if (type == "linear" || type == "") { return HL_ACTIVATION_LINEAR; } else { LOG(FATAL) << "Do not support activation 
type " << type; diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index 479b457e55a7dac58ff390cce8d67d46da3b474d..aea77248cbac0f3ee044b05894d37718e692a0fc 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -120,6 +120,14 @@ message PoolConfig { optional uint32 padding_y = 13 [default = 0]; } +message SppConfig { + required string pool_type = 1; + required uint32 pyramid_height = 2; + required uint32 channels = 3; + required uint32 img_size = 4; + optional uint32 img_size_y = 5; +} + message NormConfig { // rnorm or cmrnorm required string norm_type = 1; @@ -196,6 +204,9 @@ message ProjectionConfig { // For IdentityOffsetProjection optional uint64 offset = 11 [default = 0]; + + // For pool + optional PoolConfig pool_conf = 12; } message OperatorConfig { @@ -245,6 +256,7 @@ message LayerInputConfig { optional string input_layer_argument = 9; optional BilinearInterpConfig bilinear_interp_conf = 10; optional MaxOutConfig maxout_conf = 11; + optional SppConfig spp_conf = 12; } message LayerConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index c55579c960ecc3f3640d2cb6e77641f0f14a3328..eec978e1faf48805c70f25bdd55e4183fddb2fcc 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -218,7 +218,7 @@ def Inputs(*args): @config_func def HasInputsSet(): - return len(g_config.model_config.input_layer_names) != 0 + return len(g_current_submodel.input_layer_names) != 0 # Define the name of the output layers of the NeuralNetwork. @@ -471,6 +471,7 @@ class Input(Cfg): image=None, block_expand=None, maxout=None, + spp=None, format=None, nnz=None, is_static=None, @@ -671,7 +672,6 @@ class ConvProjection(Projection): def calc_parameter_dims(self, input_size, output_size): return None - # Define a operator for mixed layer @config_class class Operator(Cfg): @@ -795,6 +795,17 @@ class Pool(Cfg): padding = None, padding_y = None): self.add_keys(locals()) + +# please refer to the comments in proto/ModelConfig.proto +@config_class +class SpatialPyramidPool(Cfg): + def __init__( + self, + pool_type, + pyramid_height, + channels, + img_width = None): + self.add_keys(locals()) # please refer to the comments in proto/ModelConfig.proto @config_class @@ -1081,6 +1092,22 @@ def parse_pool(pool, input_layer_name, pool_conf): pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y, pool_conf.padding_y, pool_conf.stride_y, False) +def parse_spp(spp, input_layer_name, spp_conf): + spp_conf.pool_type = spp.pool_type + config_assert(spp.pool_type in ['max-projection', 'avg-projection'], + "pool-type %s is not in " "['max-projection', 'avg-projection']" + % spp.pool_type) + spp_conf.pyramid_height = spp.pyramid_height + spp_conf.channels = spp.channels + + img_pixels = g_layer_map[input_layer_name].size / spp_conf.channels + + spp_conf.img_size = default(spp.img_width, int(img_pixels ** 0.5)) + spp_conf.img_size_y = img_pixels / spp_conf.img_size + config_assert(spp_conf.img_size * spp_conf.img_size_y == img_pixels, + "Incorrect input image size %d for input image pixels %d" + % (spp_conf.img_size, img_pixels)) + def parse_image(image, input_layer_name, image_conf): image_conf.channels = image.channels image_pixels = g_layer_map[input_layer_name].size / image_conf.channels @@ -1170,14 +1197,14 @@ def parse_block_expand(block_expand, input_layer_name, block_expand_conf): block_expand_conf.output_x = 0 else: block_expand_conf.output_x = cnn_output_size( - 
block_expand.img_size_x, block_expand.block_x, + block_expand.img_size_x, block_expand.block_x, block_expand.padding_x, block_expand.stride_x, False) if block_expand_conf.img_size_y == 0: block_expand_conf.output_y = 0 else: block_expand_conf.output_y = cnn_output_size( - block_expand.img_size_y, block_expand.block_y, + block_expand.img_size_y, block_expand.block_y, block_expand.padding_y, block_expand.stride_y, False) def parse_maxout(maxout, input_layer_name, maxout_conf): @@ -1185,7 +1212,7 @@ def parse_maxout(maxout, input_layer_name, maxout_conf): maxout_conf.groups = maxout.groups maxout_conf.img_size_x = maxout.img_size_x maxout_conf.img_size_y = maxout.img_size_y - + # Define an evaluator @config_func def Evaluator( @@ -1756,6 +1783,25 @@ class PoolLayer(LayerBase): name, pool_conf.output_y, pool_conf.output_x)) self.set_layer_size((pool_conf.output_x * pool_conf.output_y) * pool_conf.channels) +@config_layer('spp') +class SpatialPyramidPoolLayer(LayerBase): + def __init__( + self, + name, + inputs, + device=None): + super(SpatialPyramidPoolLayer, self).__init__(name, 'spp', 0, inputs=inputs, device=device) + for input_index in xrange(len(self.inputs)): + input_layer = self.get_input_layer(input_index) + parse_spp( + self.inputs[input_index].spp, + input_layer.name, + self.config.inputs[input_index].spp_conf) + spp_conf = self.config.inputs[input_index].spp_conf + output_size = (pow(4, spp_conf.pyramid_height) - 1) / (4 - 1) + print("output size for %s is %d " % (name, output_size)) + self.set_layer_size(output_size * spp_conf.channels) + @config_layer('batch_norm') class BatchNormLayer(LayerBase): layer_type = 'batch_norm' @@ -1881,7 +1927,7 @@ class MaxOutLayer(LayerBase): self.config.inputs[0].maxout_conf) maxout_conf = self.config.inputs[0].maxout_conf self.set_layer_size(g_layer_map[input_layer.name].size / maxout_conf.groups) - + # key: cost type # value: cost class g_cost_map = {} @@ -1903,6 +1949,7 @@ define_cost('SumOfSquaresCostLayer', 'square_error') define_cost('MultiBinaryLabelCrossEntropy', 'multi_binary_label_cross_entropy') define_cost('SoftBinaryClassCrossEntropy', 'soft_binary_class_cross_entropy') define_cost('HuberTwoClass', 'huber') +define_cost('SumCost', 'sum_cost') @config_layer('hsigmoid') class HierarchicalSigmoidLayer(LayerBase): @@ -3015,7 +3062,7 @@ def Layer( layer_func = layers.get(type) config_assert(layer_func, "layer type '%s' not supported." % type) - layer_func(name, **xargs) + return layer_func(name, **xargs) @config_func def ParameterHook( diff --git a/python/paddle/trainer_config_helpers/__init__.py b/python/paddle/trainer_config_helpers/__init__.py index 451b9ac3396eadf9fab2b5fd940a6f924e042976..adebebba2523f851507c4a0525eeaae9cfeb9dcc 100644 --- a/python/paddle/trainer_config_helpers/__init__.py +++ b/python/paddle/trainer_config_helpers/__init__.py @@ -20,3 +20,6 @@ from layers import * from networks import * from optimizers import * from attrs import * + +# This will enable operator overload for LayerOutput +import math diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py index ad5cdc0a0eb13f7a58e7d89ebfb79d33a63b75d5..2202d0bf96976d5ca694f1417af5da9c31eaa9f0 100644 --- a/python/paddle/trainer_config_helpers/activations.py +++ b/python/paddle/trainer_config_helpers/activations.py @@ -23,9 +23,9 @@ __all__ = ["TanhActivation", "SigmoidActivation", class BaseActivation(object): """ - A mark for activation class. + A mark for activation class. 
Each activation inherit BaseActivation, which has two parameters. - + :param name: activation name in paddle config. :type name: basestring :param support_hppl: True if supported by hppl. HPPL is a library used by paddle @@ -194,7 +194,7 @@ class SquareActivation(BaseActivation): class ExpActivation(BaseActivation): """ Exponential Activation. - + .. math:: f(z) = e^z. """ diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 59822180883c9a41990e67625c6c75952e7c34b7..82c57e7f90ad53aa91ea4b4e4afe8b8308bbedc8 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -31,6 +31,7 @@ import copy __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel", "identity_projection", "dotmul_projection", "dotmul_operator", + "repeat_layer", "table_projection", "mixed_layer", "data_layer", "embedding_layer", "fc_layer", "grumemory", "pooling_layer", "lstmemory", "last_seq", "first_seq", @@ -52,10 +53,11 @@ __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel", 'convex_comb_layer', 'ctc_layer', 'crf_layer', 'crf_decoding_layer', 'nce_layer', 'cross_entropy_with_selfnorm', 'cross_entropy', - 'multi_binary_label_cross_entropy', + 'multi_binary_label_cross_entropy', 'sum_cost', 'rank_cost', 'lambda_cost', 'huber_cost', 'block_expand_layer', - 'maxout_layer', 'out_prod_layer', 'print_layer' + 'maxout_layer', 'out_prod_layer', 'print_layer', + 'spp_layer', ] @@ -99,6 +101,7 @@ class LayerType(object): SCALING_LAYER = 'scaling' TRANS_LAYER = 'trans' OUT_PROD_LAYER = 'out_prod' + FEATURE_MAP_EXPAND_LAYER = 'featmap_expand' MEMORY = 'memory' MAXID_LAYER = 'maxid' @@ -113,6 +116,7 @@ class LayerType(object): LINEAR_COMBINATION_LAYER = "convex_comb" BLOCK_EXPAND = "blockexpand" MAXOUT = "maxout" + SPP_LAYER = "spp" PRINT_LAYER = "print" @@ -128,6 +132,7 @@ class LayerType(object): CROSS_ENTROPY_WITH_SELFNORM = "multi_class_cross_entropy_with_selfnorm" SOFT_BIN_CLASS_CROSS_ENTROPY = "soft_binary_class_cross_entropy" MULTI_BIN_LABEL_CROSS_ENTROPY = "multi_binary_label_cross_entropy" + SUM_COST = "sum_cost" @staticmethod def is_layer_type(type_name): @@ -181,6 +186,7 @@ class LayerOutput(object): reverse=None): assert isinstance(name, basestring) assert isinstance(layer_type, basestring) + assert size is not None assert LayerType.is_layer_type(layer_type) self.name = name self.layer_type = layer_type @@ -873,6 +879,7 @@ def pooling_layer(input, pooling_type=None, name=None, bias_attr=None, size=input.size) + @wrap_bias_attr_default() @wrap_param_attr_default() @wrap_act_default(param_names=['gate_act'], @@ -1209,6 +1216,48 @@ def expand_layer(input, expand_as, parents=[input, expand_as]) +@wrap_name_default() +@layer_support() +def repeat_layer(input, num_repeats, + name=None, + layer_attr=None): + """ + A layer for repeating the input for num_repeats times. This is equivalent + to apply concat_layer() with num_repeats same input. + + .. math:: + y = [x, x, \cdots, x] + + The example usage is: + + .. code-block:: python + + expand = repeat_layer(layer, 4) + + :param input: Input layer + :type input: LayerOutput + :param num_repeats: Repeat the input so many times + :type num_repeats: int + :param name: Layer name. + :type name: basestring + :param layer_attr: extra layer attributes. + :type layer_attr: ExtraLayerAttribute. + :return: LayerOutput object. 
+ :rtype: LayerOutput + """ + + l = Layer( + inputs=[input.name], + name=name, + num_filters=num_repeats, + type=LayerType.FEATURE_MAP_EXPAND_LAYER, + **ExtraAttr.to_kwargs(layer_attr) + ) + return LayerOutput(name=name, + size=l.config.size, + layer_type=LayerType.FEATURE_MAP_EXPAND_LAYER, + parents=[input]) + @wrap_name_default() @layer_support() def interpolation_layer(input, weight, name=None, layer_attr=None): @@ -1296,7 +1345,7 @@ def bilinear_interp_layer(input, assert out_size_x > 0 and out_size_y > 0 assert input.num_filters is not None num_channels = input.num_filters - Layer(name=name, + l = Layer(name=name, inputs=Input(input.name, bilinear_interp=BilinearInterp(out_size_x=out_size_x, out_size_y=out_size_y, @@ -1304,7 +1353,7 @@ def bilinear_interp_layer(input, type=LayerType.BILINEAR_INTERP_LAYER, **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput(name, LayerType.BILINEAR_INTERP_LAYER, parents=[input], - num_filters=num_channels) + num_filters=num_channels, size=l.config.size) @wrap_name_default() @layer_support() @@ -1482,7 +1531,7 @@ def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None): inputs=[a.name, b.name], **ExtraLayerAttribute.to_kwargs(layer_attr) ) - return LayerOutput(name, LayerType.COSINE_SIM, parents=[a, b]) + return LayerOutput(name, LayerType.COSINE_SIM, parents=[a, b], size=size) @wrap_name_default() @@ -1545,7 +1594,7 @@ def hsigmoid(input, label, num_classes, name=None, bias_attr=None, ipts_for_layer.append(label.name) parents.append(label) - Layer( + l = Layer( name=name, type=LayerType.HSIGMOID, num_classes=num_classes, @@ -1553,7 +1602,8 @@ def hsigmoid(input, label, num_classes, name=None, bias_attr=None, inputs=ipts_for_layer, **ExtraLayerAttribute.to_kwargs(layer_attr) ) - return LayerOutput(name, LayerType.HSIGMOID, parents=parents) + return LayerOutput(name, LayerType.HSIGMOID, parents=parents, + size=l.config.size) @wrap_name_default("conv") @@ -1671,7 +1721,7 @@ def img_conv_layer(input, filter_size, num_filters, lt = LayerType.CONVTRANS_LAYER if trans else LayerType.CONV_LAYER - Layer( + l = Layer( name=name, inputs=Input(input.name, conv=Conv( filter_size=filter_size, padding=padding, stride=stride, @@ -1687,7 +1737,8 @@ def img_conv_layer(input, filter_size, num_filters, **ExtraLayerAttribute.to_kwargs(layer_attr) ) return LayerOutput(name, lt, parents=[input], - activation=act, num_filters=num_filters) + activation=act, num_filters=num_filters, + size=l.config.size) @wrap_name_default("pool") @@ -1718,7 +1769,7 @@ def img_pool_layer(input, pool_size, name=None, :type pool_size_y: int|None :param num_channels: number of input channel. :type num_channels: int - :param pool_type: pooling type. MaxPooling or AveragePooling. Default is + :param pool_type: pooling type. MaxPooling or AvgPooling. Default is MaxPooling. :type pool_type: BasePoolingType :param stride: stride width of pooling. 
@@ -1750,7 +1801,7 @@ def img_pool_layer(input, pool_size, name=None, stride_y = stride if stride_y is None else stride_y padding_y = padding if padding_y is None else padding_y - Layer( + l = Layer( name=name, type=LayerType.POOL_LAYER, inputs=[Input(input.name, @@ -1769,6 +1820,62 @@ def img_pool_layer(input, pool_size, name=None, **ExtraLayerAttribute.to_kwargs(layer_attr) ) return LayerOutput(name, LayerType.POOL_LAYER, parents=[input], + num_filters=num_channels, size=l.config.size) + + +@wrap_name_default("spp") +@layer_support() +def spp_layer(input, name=None, num_channels=None, pool_type=None, + pyramid_height=None, img_width=None, layer_attr=None): + """ + Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition. + For details, please refer to + `Kaiming He's paper <https://arxiv.org/abs/1406.4729>`_. + + :param name: layer name. + :type name: basestring + :param input: layer's input. + :type input: LayerOutput + :param num_channels: number of input channel. + :type num_channels: int + :param pool_type: Pooling type. MaxPooling or AvgPooling. Default is MaxPooling. + :type pool_type: BasePoolingType + :param pyramid_height: pyramid height. + :type pyramid_height: int + :param img_width: the width of input feature map. If it is None, the input feature + map should be square. + :type img_width: int|None + :param layer_attr: Extra Layer Attribute. + :type layer_attr: ExtraLayerAttribute + :return: LayerOutput object. + :rtype: LayerOutput + """ + if num_channels is None: + assert input.num_filters is not None + num_channels = input.num_filters + + if pool_type is None: + pool_type = MaxPooling() + elif isinstance(pool_type, AvgPooling): + pool_type.name = 'avg' + + type_name = pool_type.name + if (isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)): + type_name += '-projection' + + Layer( + name=name, + type=LayerType.SPP_LAYER, + inputs=Input(input.name, + spp=SpatialPyramidPool(pool_type=type_name, + channels=num_channels, + pyramid_height=pyramid_height, + img_width=img_width) + ), + **ExtraLayerAttribute.to_kwargs(layer_attr) + ) + return LayerOutput(name, LayerType.SPP_LAYER, parents=[input], num_filters=num_channels) @@ -1778,7 +1885,7 @@ def __img_norm_layer__(name, input, size, norm_type, scale, power, assert input.num_filters is not None num_channels = input.num_filters - Layer( + l = Layer( name=name, type=LayerType.NORM_LAYER, inputs=Input( input.name, norm=Norm(norm_type=norm_type, channels=num_channels, size=size, @@ -1788,7 +1895,8 @@ def __img_norm_layer__(name, input, size, norm_type, scale, power, **ExtraLayerAttribute.to_kwargs(layer_attr) ) return LayerOutput(name, layer_type=LayerType.NORM_LAYER, parents=[input], - num_filters=num_channels, img_norm_type=norm_type) + num_filters=num_channels, img_norm_type=norm_type, + size=l.config.size) @wrap_name_default("crmnorm") @@ -1913,7 +2021,7 @@ def batch_norm_layer(input, act=None, name=None, num_channels=None, num_channels = input.size assert (batch_norm_type is None) or (batch_norm_type == "batch_norm") or \ (batch_norm_type == "cudnn_batch_norm") - Layer( + l = Layer( name=name, inputs=Input(input.name, image=Image(channels=num_channels), @@ -1929,7 +2037,8 @@ def batch_norm_layer(input, act=None, name=None, num_channels=None, return LayerOutput(name=name, layer_type=LayerType.BATCH_NORM_LAYER, parents=[input], activation=act, - num_filters=num_channels, + size=l.config.size) @wrap_name_default() @@ -2034,7 +2143,7 @@ def addto_layer(input, act=None, name=None,
bias_attr=None, if each_input.num_filters is not None: num_filters = each_input.num_filters - Layer( + l = Layer( name=name, type=LayerType.ADDTO_LAYER, inputs=ipts_for_layer, bias=ParamAttr.to_bias(bias_attr), active_type=act.name, @@ -2042,7 +2151,8 @@ def addto_layer(input, act=None, name=None, bias_attr=None, ) return LayerOutput(name, LayerType.ADDTO_LAYER, parents=input, - activation=act, num_filters=num_filters) + activation=act, num_filters=num_filters, + size=l.config.size) @wrap_act_default(act=IdentityActivation()) @@ -2651,13 +2761,14 @@ def maxid_layer(input, name=None, layer_attr=None): """ assert isinstance(input, LayerOutput) - Layer(name=name, + l = Layer(name=name, type='maxid', inputs=[input.name], **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput(name=name, layer_type=LayerType.MAXID_LAYER, - parents=[input]) + parents=[input], + size=l.config.size) @wrap_name_default() @@ -2686,13 +2797,14 @@ def out_prod_layer(input1, input2, name=None, layer_attr=None): assert isinstance(input1, LayerOutput) assert isinstance(input2, LayerOutput) - Layer(name=name, + l = Layer(name=name, type=LayerType.OUT_PROD_LAYER, inputs=[input1.name, input2.name], **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput(name=name, layer_type=LayerType.OUT_PROD_LAYER, - parents=[input1, input2]) + parents=[input1, input2], + size=l.config.size) @wrap_name_default() @@ -2721,13 +2833,14 @@ def eos_layer(input, eos_id, name=None, layer_attr=None): :return: LayerOutput object. :rtype: LayerOutput """ - Layer(name=name, + l = Layer(name=name, type=LayerType.EOSID_LAYER, eos_id=eos_id, inputs=[input.name], **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput(name=name, layer_type=LayerType.EOSID_LAYER, - parents=[input]) + parents=[input], + size=l.config.size) @wrap_name_default() @@ -2892,7 +3005,7 @@ def regression_cost(input, label, weight=None, name=None, Layer(inputs=ipts, type="square_error", name=name, **ExtraLayerAttribute.to_kwargs(layer_attr)) - return LayerOutput(name, LayerType.COST, parents=parents) + return LayerOutput(name, LayerType.COST, parents=parents, size=1) @wrap_name_default("cost") @@ -2944,7 +3057,7 @@ def classification_cost(input, label, weight=None, name=None, for each_evaluator in evaluator: __add_evaluator__(each_evaluator) - return LayerOutput(name, LayerType.COST, parents=parents) + return LayerOutput(name, LayerType.COST, parents=parents, size=1) def conv_operator(img, filter, filter_size, num_filters, @@ -3326,13 +3439,14 @@ def sampling_id_layer(input, name=None, layer_attr=None): :return: LayerOutput object. 
:rtype: LayerOutput """ - Layer( + l = Layer( name=name, type=LayerType.SAMPLING_ID_LAYER, inputs=[Input(input.name)], **ExtraLayerAttribute.to_kwargs(layer_attr) ) - return LayerOutput(name, LayerType.SAMPLING_ID_LAYER, input) + return LayerOutput(name, LayerType.SAMPLING_ID_LAYER, input, + size=l.config.size) @wrap_name_default() @@ -3373,7 +3487,8 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0, inputs=[Input(input.name)], **ExtraLayerAttribute.to_kwargs(layer_attr) ) - return LayerOutput(name, LayerType.SLOPE_INTERCEPT_LAYER, input) + return LayerOutput(name, LayerType.SLOPE_INTERCEPT_LAYER, input, + size=input.size) @wrap_name_default() @@ -3512,7 +3627,7 @@ def block_expand_layer(input, if num_channels is None: assert input.num_filters is not None num_channels = input.num_filters - Layer(name=name, + l = Layer(name=name, inputs=Input(input.name, block_expand=BlockExpand(channels=num_channels, block_x=block_x, @@ -3525,7 +3640,8 @@ def block_expand_layer(input, **ExtraLayerAttribute.to_kwargs(layer_attr) ) - return LayerOutput(name, LayerType.BLOCK_EXPAND, parents=[input]) + return LayerOutput(name, LayerType.BLOCK_EXPAND, parents=[input], + size=l.config.size) @wrap_name_default() @@ -3586,13 +3702,14 @@ def maxout_layer(input, assert input.num_filters is not None num_channels = input.num_filters assert num_channels % groups == 0 - Layer(name=name, + l = Layer(name=name, inputs=Input(input.name, maxout=MaxOut(channels=num_channels, groups=groups)), type=LayerType.MAXOUT, **ExtraLayerAttribute.to_kwargs(layer_attr)) - return LayerOutput(name, LayerType.MAXOUT, parents=[input]) + return LayerOutput(name, LayerType.MAXOUT, parents=[input], + size=l.config.size) @wrap_name_default() @@ -3718,7 +3835,10 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None, parents = [input, label] if weight is not None: parents.append(weight) - return LayerOutput(name, LayerType.CRF_LAYER, parents, size=size) + # The size for LayerOutput means the dimension of the output. + # It's different from the meaning of crf layer, which is the number of + # classes. + return LayerOutput(name, LayerType.CRF_LAYER, parents, size=1) @wrap_name_default() @@ -3766,7 +3886,10 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None, parents = [input] if label is not None: parents.append(label) - return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=size) + # The size for LayerOutput means the dimension of the output. + # It's different from the meaning of crf layer, which is the number of + # classes. + return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=1) @wrap_bias_attr_default(has_bias=True) @wrap_name_default() @@ -3834,7 +3957,7 @@ def nce_layer(input, label, num_classes, weight=None, ipts_for_layer.append(weight.name) parents.append(weight) - Layer( + l = Layer( name=name, type=LayerType.NCE_LAYER, num_classes=num_classes, @@ -3844,7 +3967,8 @@ def nce_layer(input, label, num_classes, weight=None, bias=ParamAttr.to_bias(bias_attr), **ExtraLayerAttribute.to_kwargs(layer_attr) ) - return LayerOutput(name, LayerType.NCE_LAYER, parents=parents) + return LayerOutput(name, LayerType.NCE_LAYER, parents=parents, + size=l.config.size) """ following are cost Layers. 
@@ -3919,7 +4043,7 @@ def rank_cost(left, right, label, weight=None, name=None, coeff=1.0, layer_attr= **ExtraLayerAttribute.to_kwargs(layer_attr) ) - return LayerOutput(name, LayerType.RANK_COST, parents=parents) + return LayerOutput(name, LayerType.RANK_COST, parents=parents, size=1) @wrap_name_default() @@ -3971,7 +4095,8 @@ def lambda_cost(input, score, name, NDCG_num=5, max_sort_size=-1, layer_attr=Non **ExtraLayerAttribute.to_kwargs(layer_attr) ) - return LayerOutput(name, LayerType.LAMBDA_COST, parents=[input, score]) + return LayerOutput(name, LayerType.LAMBDA_COST, parents=[input, score], + size=1) @wrap_name_default() @@ -3982,14 +4107,13 @@ def cross_entropy(input, label, name=None, coeff=1.0, layer_attr=None): .. code-block:: python - cost = cross_entropy(input, label) + cost = cross_entropy(input=input_layer, + label=label_layer) :param input: The first input layer. :type input: LayerOutput. :param label: The input label. :type input: LayerOutput. - :param type: The type of cost. - :type type: basestring. :param name: The name of this layers. It is not necessary. :type name: None|basestring. :param coeff: The coefficient affects the gradient in the backward. @@ -4006,7 +4130,8 @@ def cross_entropy(input, label, name=None, coeff=1.0, layer_attr=None): coeff=coeff, **ExtraLayerAttribute.to_kwargs(layer_attr) ) - return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=[input, label]) + return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=[input, label], + size=1) @wrap_name_default() @@ -4019,14 +4144,13 @@ def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0, .. code-block:: python - cost = cross_entropy_with_selfnorm(input, label) + cost = cross_entropy_with_selfnorm(input=input_layer, + label=label_layer) :param input: The first input layer. :type input: LayerOutput. :param label: The input label. :type input: LayerOutput. - :param type: The type of cost. - :type type: basestring. :param name: The name of this layers. It is not necessary. :type name: None|basestring. :param coeff: The coefficient affects the gradient in the backward. @@ -4048,7 +4172,39 @@ def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0, return LayerOutput(name, LayerType.CROSS_ENTROPY_WITH_SELFNORM, - parents=[input, label]) + parents=[input, label], size=1) + + +@wrap_name_default() +@layer_support() +def sum_cost(input, name=None, layer_attr=None): + """ + A loss layer which calculates the sum of the input as its loss. + + .. code-block:: python + + cost = sum_cost(input=input_layer) + + :param input: The input layer. + :type input: LayerOutput. + :param name: The name of this layer. It is not necessary. + :type name: None|basestring. + :param layer_attr: Extra Layer Attribute. + :type layer_attr: ExtraLayerAttribute + :return: LayerOutput object. + :rtype: LayerOutput. + """ + assert isinstance(input, LayerOutput) + Layer(name=name, + type=LayerType.SUM_COST, + inputs=[input.name], + **ExtraLayerAttribute.to_kwargs(layer_attr) + ) + + return LayerOutput(name, + LayerType.SUM_COST, + parents=[input], + size=1) @wrap_name_default() @@ -4059,7 +4215,8 @@ def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None): .. code-block:: python - cost = huber_cost(input, label) + cost = huber_cost(input=input_layer, + label=label_layer) :param input: The first input layer. :type input: LayerOutput.
@@ -4083,7 +4240,7 @@ def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None): coeff=coeff, **ExtraLayerAttribute.to_kwargs(layer_attr) ) - return LayerOutput(name, LayerType.HUBER, parents=[input, label]) + return LayerOutput(name, LayerType.HUBER, parents=[input, label], size=1) @wrap_name_default() @@ -4095,7 +4252,8 @@ def multi_binary_label_cross_entropy(input, label, name=None, coeff=1.0, .. code-block:: python - cost = multi_binary_label_cross_entropy(input, label) + cost = multi_binary_label_cross_entropy(input=input_layer, + label=label_layer) :param input: The first input layer. :type input: LayerOutput @@ -4126,4 +4284,4 @@ def multi_binary_label_cross_entropy(input, label, name=None, coeff=1.0, **ExtraLayerAttribute.to_kwargs(layer_attr) ) return LayerOutput(name, LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY, - parents=[input, label]) + parents=[input, label], size=1) diff --git a/python/paddle/trainer_config_helpers/math.py b/python/paddle/trainer_config_helpers/math.py index e35849b77ac531b4a4676019e01285af67925bd9..7d7bb2914859fbda222b41a7bbe568b283b32487 100644 --- a/python/paddle/trainer_config_helpers/math.py +++ b/python/paddle/trainer_config_helpers/math.py @@ -13,10 +13,11 @@ # limitations under the License. from .layers import LayerOutput, mixed_layer, identity_projection, \ - slope_intercept_layer + slope_intercept_layer, scaling_layer, repeat_layer from .attrs import is_compatible_with from .default_decorators import * import activations as act +from paddle.trainer.config_parser import logger __all__ = [] @@ -40,7 +41,21 @@ register_unary_math_op('square', act.SquareActivation()) def add(layeroutput, other): if is_compatible_with(other, float): return slope_intercept_layer(input=layeroutput, intercept=other) - assert isinstance(other, LayerOutput) + if not isinstance(other, LayerOutput): + logger.fatal("LayerOutput can only be added with" + " another LayerOutput or a number") + if layeroutput.size == other.size: + return mixed_layer(input=[identity_projection(input=layeroutput), + identity_projection(input=other)]) + if other.size != 1 and layeroutput.size != 1: + logger.fatal("Two LayerOutput can be added only if they have equal size" + " or one of their sizes is 1. 
sizes are %s and %s" % + (layeroutput.size, other.size)) + elif layeroutput.size == 1: + tmp = layeroutput + layeroutput = other + other = tmp + other = repeat_layer(other, layeroutput.size) return mixed_layer(input=[identity_projection(input=layeroutput), identity_projection(input=other)]) @@ -50,10 +65,11 @@ LayerOutput.__add__ = add def sub(layeroutput, other): if is_compatible_with(other, float): return slope_intercept_layer(input=layeroutput, intercept=other) - assert isinstance(other, LayerOutput) + if not isinstance(other, LayerOutput): + logger.fatal("LayerOutput can only be subtracted with" + " another Layeroutput or a number") neg = slope_intercept_layer(input=other, slope=-1.0) - return mixed_layer(input=[identity_projection(input=layeroutput), - identity_projection(input=neg)]) + return add(layeroutput, neg) LayerOutput.__sub__ = sub @@ -62,3 +78,20 @@ def rsub(layeroutput, other): return add(neg, other) LayerOutput.__rsub__ = rsub + +def mul(layeroutput, other): + if is_compatible_with(other, float): + return slope_intercept_layer(input=layeroutput, slope=other) + if not isinstance(other, LayerOutput): + logger.fatal("LayerOutput can only be multiplied with" + " another Layeroutput or a number") + elif layeroutput.size == 1: + return scaling_layer(input=other, weight=layeroutput) + elif other.size == 1: + return scaling_layer(input=layeroutput, weight=other) + else: + logger.fatal("At least one of the operand of '*' must be a number" + " or a LayerOutput with size=1") + +LayerOutput.__mul__ = mul +LayerOutput.__rmul__ = mul diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh index cafc2142f25c74d54ec8a1ab937db23306da2904..e84e2a4b7f36a42999b2decff2b277cbbd56cda0 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh @@ -11,8 +11,8 @@ test_sequence_pooling test_lstmemory_layer test_grumemory_layer last_first_seq test_expand_layer test_ntm_layers test_hsigmoid img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight -test_bilinear_interp test_maxout test_bi_grumemory math_ops -test_spilit_datasource) +test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops +test_split_datasource) for conf in ${configs[*]} diff --git a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py index fe515b7029336d093df5428ab8ac1c65a2d4e98a..7c2770c616dc11b1c69450dff9085c9943a2ff8f 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py +++ b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py @@ -19,6 +19,12 @@ y = x + y y = y - x y = y - 2 y = 2 - y - +y = 2 * y +y = y * 3 +z= data_layer(name='data_2', size=1) +y = y * z +y = z * y +y = y + z +y = z + y outputs(y) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr index 1767445c44bf5c0ea7c1149ad9fef2dd92508c54..da8da1b541f37a09654202f68232b99e4dac9f61 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr @@ -209,8 +209,129 @@ layers { slope: 1.0 intercept: 2 } +layers { + name: 
"__slope_intercept_layer_6__" + type: "slope_intercept" + size: 100 + active_type: "" + inputs { + input_layer_name: "__slope_intercept_layer_5__" + } + slope: 2 + intercept: 0.0 +} +layers { + name: "__slope_intercept_layer_7__" + type: "slope_intercept" + size: 100 + active_type: "" + inputs { + input_layer_name: "__slope_intercept_layer_6__" + } + slope: 3 + intercept: 0.0 +} +layers { + name: "data_2" + type: "data" + size: 1 + active_type: "" +} +layers { + name: "__scaling_layer_0__" + type: "scaling" + size: 100 + active_type: "" + inputs { + input_layer_name: "data_2" + } + inputs { + input_layer_name: "__slope_intercept_layer_7__" + } +} +layers { + name: "__scaling_layer_1__" + type: "scaling" + size: 100 + active_type: "" + inputs { + input_layer_name: "data_2" + } + inputs { + input_layer_name: "__scaling_layer_0__" + } +} +layers { + name: "__repeat_layer_0__" + type: "featmap_expand" + size: 100 + active_type: "" + inputs { + input_layer_name: "data_2" + } + num_filters: 100 +} +layers { + name: "__mixed_2__" + type: "mixed" + size: 100 + active_type: "" + inputs { + input_layer_name: "__scaling_layer_1__" + proj_conf { + type: "identity" + name: "___mixed_2__.w0" + input_size: 100 + output_size: 100 + } + } + inputs { + input_layer_name: "__repeat_layer_0__" + proj_conf { + type: "identity" + name: "___mixed_2__.w1" + input_size: 100 + output_size: 100 + } + } +} +layers { + name: "__repeat_layer_1__" + type: "featmap_expand" + size: 100 + active_type: "" + inputs { + input_layer_name: "data_2" + } + num_filters: 100 +} +layers { + name: "__mixed_3__" + type: "mixed" + size: 100 + active_type: "" + inputs { + input_layer_name: "__mixed_2__" + proj_conf { + type: "identity" + name: "___mixed_3__.w0" + input_size: 100 + output_size: 100 + } + } + inputs { + input_layer_name: "__repeat_layer_1__" + proj_conf { + type: "identity" + name: "___mixed_3__.w1" + input_size: 100 + output_size: 100 + } + } +} +input_layer_names: "data_2" input_layer_names: "data" -output_layer_names: "__slope_intercept_layer_5__" +output_layer_names: "__mixed_3__" sub_models { name: "root" layer_names: "data" @@ -228,8 +349,18 @@ sub_models { layer_names: "__slope_intercept_layer_3__" layer_names: "__slope_intercept_layer_4__" layer_names: "__slope_intercept_layer_5__" + layer_names: "__slope_intercept_layer_6__" + layer_names: "__slope_intercept_layer_7__" + layer_names: "data_2" + layer_names: "__scaling_layer_0__" + layer_names: "__scaling_layer_1__" + layer_names: "__repeat_layer_0__" + layer_names: "__mixed_2__" + layer_names: "__repeat_layer_1__" + layer_names: "__mixed_3__" + input_layer_names: "data_2" input_layer_names: "data" - output_layer_names: "__slope_intercept_layer_5__" + output_layer_names: "__mixed_3__" is_recurrent_layer_group: false } diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr index 5261cf0c44943689a957bb99c21075bb7341cd49..f6045fe1f68255daf0d9b5ab05034eec633e4503 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr @@ -23,6 +23,17 @@ layers { size: 10 active_type: "" } +layers { + name: "__fc_layer_0__" + type: "fc" + size: 4 + active_type: "tanh" + inputs { + input_layer_name: "input" + input_parameter_name: "___fc_layer_0__.w0" + } + bias_parameter_name: "___fc_layer_0__.wbias" +} layers { name: 
"__ctc_layer_0__" type: "ctc" @@ -36,17 +47,6 @@ layers { } norm_by_times: false } -layers { - name: "__fc_layer_0__" - type: "fc" - size: 4 - active_type: "tanh" - inputs { - input_layer_name: "input" - input_parameter_name: "___fc_layer_0__.w0" - } - bias_parameter_name: "___fc_layer_0__.wbias" -} layers { name: "crf_label" type: "data" @@ -191,6 +191,16 @@ layers { } coeff: 1.0 } +layers { + name: "__sum_cost_0__" + type: "sum_cost" + size: 1 + active_type: "" + inputs { + input_layer_name: "__fc_layer_0__" + } + coeff: 1.0 +} parameters { name: "___fc_layer_0__.w0" size: 800 @@ -241,14 +251,15 @@ output_layer_names: "__cross_entropy_0__" output_layer_names: "__cross_entropy_with_selfnorm_0__" output_layer_names: "__huber_cost_0__" output_layer_names: "__multi_binary_label_cross_entropy_0__" +output_layer_names: "__sum_cost_0__" sub_models { name: "root" layer_names: "input" layer_names: "labels" layer_names: "probs" layer_names: "xe-label" - layer_names: "__ctc_layer_0__" layer_names: "__fc_layer_0__" + layer_names: "__ctc_layer_0__" layer_names: "crf_label" layer_names: "__crf_layer_0__" layer_names: "left" @@ -264,6 +275,7 @@ sub_models { layer_names: "huber_label" layer_names: "__huber_cost_0__" layer_names: "__multi_binary_label_cross_entropy_0__" + layer_names: "__sum_cost_0__" input_layer_names: "input" input_layer_names: "labels" input_layer_names: "crf_label" @@ -284,6 +296,7 @@ sub_models { output_layer_names: "__cross_entropy_with_selfnorm_0__" output_layer_names: "__huber_cost_0__" output_layer_names: "__multi_binary_label_cross_entropy_0__" + output_layer_names: "__sum_cost_0__" is_recurrent_layer_group: false } diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr new file mode 100644 index 0000000000000000000000000000000000000000..8b0a8f2146b709ee67981049da8061597e1716be --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr @@ -0,0 +1,34 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 3200 + active_type: "" +} +layers { + name: "__spp_0__" + type: "spp" + size: 80 + active_type: "" + inputs { + input_layer_name: "data" + spp_conf { + pool_type: "max-projection" + pyramid_height: 2 + channels: 16 + img_size: 10 + img_size_y: 20 + } + } +} +input_layer_names: "data" +output_layer_names: "__spp_0__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__spp_0__" + input_layer_names: "data" + output_layer_names: "__spp_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py index 64b45f4ded10b09ec4a7e77499e2d7b21215f430..cfaf2da001106289db09a87f6d9935e8f07ceca0 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py @@ -11,8 +11,9 @@ labels = data_layer(name='labels', size=5000) probs = data_layer(name='probs', size=10) xe_label = data_layer(name='xe-label', size=10) +hidden = fc_layer(input=seq_in, size=4) outputs(ctc_layer(input=seq_in, label=labels), - crf_layer(input=fc_layer(input=seq_in, size=4), + crf_layer(input=hidden, label=data_layer(name='crf_label', size=4)), rank_cost(left=data_layer(name='left', size=1), right=data_layer(name='right', size=1), @@ -23,4 +24,5 @@ outputs(ctc_layer(input=seq_in, label=labels), 
cross_entropy_with_selfnorm(input=probs, label=xe_label), huber_cost(input=data_layer(name='huber_probs', size=1), label=data_layer(name='huber_label', size=1)), - multi_binary_label_cross_entropy(input=probs, label=xe_label)) + multi_binary_label_cross_entropy(input=probs, label=xe_label), + sum_cost(input=hidden)) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..178387d3cf1d16bd391aad8e08950554e83fbeff --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py @@ -0,0 +1,16 @@ +from paddle.trainer_config_helpers import * + +settings( + batch_size=100, + learning_rate=1e-5 +) + +data = data_layer(name='data', size=3200) + +spp = spp_layer(input=data, + pyramid_height=2, + num_channels=16, + pool_type=MaxPooling(), + img_width=10) + +outputs(spp)
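
The sketch below (illustrative only, not part of the patch; the layer names and sizes are arbitrary assumptions) shows how the pieces introduced here fit together in a trainer config: the new spp_layer, the LayerOutput arithmetic overloads from math.py, and the new sum_cost.

# Illustrative trainer config sketch (not part of the patch).
# Names and sizes are arbitrary; only APIs used or added in this change appear.
from paddle.trainer_config_helpers import *

settings(batch_size=100, learning_rate=1e-5)

# 16 channels of 10x20 feature maps, as in test_spp_layer.py above.
img = data_layer(name='img', size=3200)

# pyramid_height=2 gives (4^2 - 1) / (4 - 1) = 5 bins per channel,
# i.e. 5 * 16 = 80 outputs, matching the size in test_spp_layer.protostr.
spp = spp_layer(input=img, num_channels=16, pyramid_height=2,
                pool_type=MaxPooling(), img_width=10)

hidden = fc_layer(input=spp, size=64)

# LayerOutput overloads from math.py: scale by a constant, then add a
# size-1 layer, which is broadcast through repeat_layer inside add().
bias = data_layer(name='bias', size=1)
combined = 0.5 * hidden + bias

# sum_cost reduces the result to a scalar training objective.
outputs(sum_cost(input=combined))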