From 45c81a414f912b93c6ad34fa4045e8e80fd396f6 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Wed, 2 Nov 2016 09:56:20 +0800 Subject: [PATCH] Add job=time in trainer, refine cudnn_conv to reduce gpu memory and speed up training. (#218) * Add benchmark for PaddlePaddle, tensorflow and caffe * ConvProjection to reduce memory for goolenet * Add unit test for ConvProjection. 1. unit test in test_LayerGrad. 2. compare the ConvPorjection and CudnnConvLayer, also compare the concat_layer+img_conv_layer and concat_layer_conv_projection. * Reduce cudnn_conv memory and add benchmark document. 1. Use TmpMatrix as the workspace in cudnn_conv to reduce gpu memory. It reduce lots of memory. 2. Add benchmark document. 3. fix smallnet_mnist_cifar.py in paddle. * Add job=time and refine cudnn_conv to reduce gpu memroy and speed up * Refine cudnn_conv and shared biases operation in concat_layer and mixed_layer. * follow comments * follow comments * Use unique_ptr to prevent memory leaks in CudnnConvLayer. --- doc/ui/cmd_argument/argument_outline.md | 7 +- doc/ui/cmd_argument/detail_introduction.md | 4 + paddle/cuda/include/hl_device_functions.cuh | 19 ++ paddle/cuda/include/hl_matrix.h | 36 +++ paddle/cuda/include/stub/hl_matrix_stub.h | 13 + paddle/cuda/src/hl_cuda_cudnn.cc | 7 +- paddle/cuda/src/hl_cuda_matrix.cu | 87 ++++++ paddle/cuda/src/hl_cuda_sparse.cuh | 18 -- paddle/gserver/layers/ConcatenateLayer.cpp | 33 ++- paddle/gserver/layers/ConvBaseLayer.cpp | 51 ++-- paddle/gserver/layers/ConvBaseLayer.h | 24 +- paddle/gserver/layers/ConvProjection.cpp | 210 ++++++++++++++ paddle/gserver/layers/ConvProjection.h | 125 +++++++++ paddle/gserver/layers/CudnnConvLayer.cpp | 256 +++--------------- paddle/gserver/layers/CudnnConvLayer.h | 73 +---- paddle/gserver/layers/ExpandConvLayer.cpp | 39 ++- paddle/gserver/layers/ExpandConvLayer.h | 10 +- paddle/gserver/layers/MixedLayer.cpp | 20 +- paddle/gserver/layers/MixedLayer.h | 1 + paddle/gserver/tests/LayerGradUtil.cpp | 6 +- paddle/gserver/tests/LayerGradUtil.h | 3 +- paddle/gserver/tests/img_conv_a.conf | 39 +++ paddle/gserver/tests/img_conv_b.conf | 32 +++ paddle/gserver/tests/test_LayerGrad.cpp | 39 +++ paddle/gserver/tests/test_NetworkCompare.cpp | 9 + paddle/math/Matrix.cpp | 52 ++++ paddle/math/Matrix.h | 28 ++ paddle/math/tests/test_matrixCompare.cpp | 56 ++++ paddle/trainer/CMakeLists.txt | 1 + paddle/trainer/Trainer.h | 1 + paddle/trainer/TrainerBenchmark.cpp | 71 +++++ paddle/trainer/TrainerMain.cpp | 2 + proto/ModelConfig.proto.m4 | 5 +- python/paddle/trainer/config_parser.py | 68 ++++- .../paddle/trainer_config_helpers/layers.py | 105 ++++++- .../paddle/trainer_config_helpers/networks.py | 165 ++++++++++- .../tests/configs/check.md5 | 9 +- .../tests/configs/generate_protostr.sh | 2 +- .../tests/configs/test_bi_grumemory.py | 10 + 39 files changed, 1340 insertions(+), 396 deletions(-) create mode 100644 paddle/gserver/layers/ConvProjection.cpp create mode 100644 paddle/gserver/layers/ConvProjection.h create mode 100644 paddle/gserver/tests/img_conv_a.conf create mode 100644 paddle/gserver/tests/img_conv_b.conf create mode 100644 paddle/trainer/TrainerBenchmark.cpp create mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py diff --git a/doc/ui/cmd_argument/argument_outline.md b/doc/ui/cmd_argument/argument_outline.md index 98dadc270d..d6cc2c6ed7 100644 --- a/doc/ui/cmd_argument/argument_outline.md +++ b/doc/ui/cmd_argument/argument_outline.md @@ -183,7 +183,7 @@ It looks like there are a lot of arguments. 
However, most of them are for develo -GPUgpu_id +GPUgpu_id √√√√ @@ -207,6 +207,11 @@ It looks like there are a lot of arguments. However, most of them are for develo √√√√ + +cudnn_conv_workspace_limit_in_mb +√√√√ + + RNN beam_size diff --git a/doc/ui/cmd_argument/detail_introduction.md b/doc/ui/cmd_argument/detail_introduction.md index 0d0362d022..07608e5edf 100644 --- a/doc/ui/cmd_argument/detail_introduction.md +++ b/doc/ui/cmd_argument/detail_introduction.md @@ -163,6 +163,10 @@ - Choose path to dynamic load NVIDIA CUDA library, for instance, /usr/local/cuda/lib64. [Default]: LD_LIBRARY_PATH - type: string (default: "", null) +* `--cudnn_conv_workspace_limit_in_mb` + - Specify cuDNN max workspace limit, in units MB, 4096MB=4GB by default. + - type: int32 (default: 4096MB=4GB) + ## NLP: RNN/LSTM/GRU * `--rnn_use_batch` - Whether to use batch method for calculation in simple RecurrentLayer. diff --git a/paddle/cuda/include/hl_device_functions.cuh b/paddle/cuda/include/hl_device_functions.cuh index 88d950d6c1..159c26f443 100755 --- a/paddle/cuda/include/hl_device_functions.cuh +++ b/paddle/cuda/include/hl_device_functions.cuh @@ -48,5 +48,24 @@ inline __device__ double paddleAtomicAdd(double* address, double val) { } } // namespace paddle +/** + * @brief sum reduction + * + * @param[in,out] smem input data, better to use __shared__ memory. + * @param[in] tid thread index. + * @param[in] threads the total thread number used to reduce, + * such as, blockDim.x. + * + * @return smem[0]: the sum of each elements in smem. + */ +__device__ __forceinline__ +void simpleReduce(real* smem, int tid, int threads) { + for (unsigned int s = threads / 2; s > 0; s >>= 1) { + if (tid < s) { + smem[tid] += smem[tid + s]; + } + __syncthreads(); + } +} #endif /* HL_DEVICE_FUNCTIONS_CUH_ */ diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h index 1741979047..71e8f8e3a6 100644 --- a/paddle/cuda/include/hl_matrix.h +++ b/paddle/cuda/include/hl_matrix.h @@ -229,4 +229,40 @@ extern void hl_cossim_derivative(real* grad, int input2_height, real scale); +/** + * @brief Matrix addition: A_d[i][j] += scale * B_d[j/channel]. + * + * @param[in] A_d input matrix (M x N). + * @param[in] B_d input matrix (1 x channel). + * @param[in] channel width of B. + * @param[in] dimM height of A. + * @param[in] dimN width of A. + * @param[in] scale scalar used for addition. + * + */ +extern void hl_matrix_add_shared_bias(real* A_d, + real* B_d, + const int channel, + const int dimM, + const int dimN, + real scale); + +/** + * @brief Matrix addition: A_d[i][j] += scale * B_d[j/channel]. + * + * @param[in] B_d input matrix (1 x channel). + * @param[in] A_d input matrix (M x N). + * @param[in] channel width of B. + * @param[in] dimM height of A. + * @param[in] dimN width of A. + * @param[in] scale scalar used for addition. 
+ * + */ +extern void hl_matrix_collect_shared_bias(real* B_d, + real* A_d, + const int channel, + const int dimM, + const int dimN, + real scale); + #endif /* HL_MATRIX_H_ */ diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h index f1f1020c84..e37b127543 100644 --- a/paddle/cuda/include/stub/hl_matrix_stub.h +++ b/paddle/cuda/include/stub/hl_matrix_stub.h @@ -101,4 +101,17 @@ inline void hl_cossim_derivative(real* grad, int input2_height, real scale) {} +inline void hl_matrix_add_shared_bias(real* A_d, + real* B_d, + const int channel, + const int dimM, + const int dimN, + real scale) {} + +inline void hl_matrix_collect_shared_bias(real* B_d, + real* A_d, + const int channel, + const int dimM, + const int dimN, + real scale) {} #endif // HL_MATRIX_STUB_H_ diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc index b215c0f6e3..7810d0d100 100644 --- a/paddle/cuda/src/hl_cuda_cudnn.cc +++ b/paddle/cuda/src/hl_cuda_cudnn.cc @@ -20,6 +20,11 @@ limitations under the License. */ #include "hl_thread.ph" #include "hl_dso_loader.h" #include "paddle/utils/Logging.h" +#include "paddle/utils/CommandLineParser.h" + +P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb, 4096, + "Specify cuDNN max workspace limit, in units MB, " + "4096MB=4GB by default."); namespace dynload { @@ -242,7 +247,7 @@ void hl_conv_workspace(hl_tensor_descriptor input, CHECK_NOTNULL(conv); // Specify workspace limit directly - size_t memoryLimitBytes = 8 * 1024 * 1024; + size_t memoryLimitBytes = (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb; // cudnn convolution forward configuration cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input); diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu index 067e68c41e..3df9f63f9e 100644 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ b/paddle/cuda/src/hl_cuda_matrix.cu @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "hl_sequence.h" #include "paddle/utils/Logging.h" #include "hl_device_functions.cuh" +#include "hl_gpu_matrix_kernel.cuh" DEFINE_MATRIX_UNARY_OP(Zero, a = 0); DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b); @@ -673,3 +674,89 @@ void hl_cossim_derivative(real* grad, input1_height, input2_height, scale); CHECK_SYNC("hl_cossim_derivate failed"); } + +__global__ void KeMatrixAddSharedBias(real* A, + real* B, + const int channel, + const int M, + const int N, + real scale) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int dim = N / channel; + if (index < M * N) { + int i = index % N; + i = i / dim; + A[index] += scale * B[i]; + } +} + +void hl_matrix_add_shared_bias(real* A_d, + real* B_d, + const int channel, + const int dimM, + const int dimN, + real scale) { + const int blocks = 512; + const int grids = DIVUP(dimM * dimN, blocks); + KeMatrixAddSharedBias<<>> + (A_d, B_d, channel, dimM, dimN, scale); + CHECK_SYNC("hl_matrix_add_shared_bias failed"); +} + + +template +__global__ void KeMatrixCollectSharedBias(real *B, + real *A, + const int channel, + const int M, + const int N, + const int dim, + const int limit, + real scale) { + if (dim < limit) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < channel) { + real sum = 0.0; + for (int i = 0; i < M; ++i) { + for (int j = 0; j < dim; ++j) { + sum += A[i * N + index * dim + j]; + } + } + B[index] += scale * sum; + } + } else { + const int tid = threadIdx.x; + const int bid = blockIdx.x; + __shared__ real smem[blockSize]; + real sum = 0.0; + for (int j = 0; j < ((dim * M + blockSize - 1) / blockSize); ++j) { + int n = j * blockSize + tid; + int m = n / dim; + int w = n % dim; + smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0; + __syncthreads(); + simpleReduce(smem, tid, blockSize); + sum += smem[0]; + } + if (tid == 0) { + B[bid] += scale * sum; + } + } +} + +void hl_matrix_collect_shared_bias(real* B_d, + real* A_d, + const int channel, + const int dimM, + const int dimN, + real scale) { + const int dim = dimN / channel; + const int blocks = 256; + const int limit = 64; + int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel; + + KeMatrixCollectSharedBias + <<< grids, blocks, 0, STREAM_DEFAULT>>> + (B_d, A_d, channel, dimM, dimN, dim, limit, scale); + CHECK_SYNC("hl_matrix_collect_shared_bias failed"); +} diff --git a/paddle/cuda/src/hl_cuda_sparse.cuh b/paddle/cuda/src/hl_cuda_sparse.cuh index c3b98f4ebc..9cf2d5a843 100644 --- a/paddle/cuda/src/hl_cuda_sparse.cuh +++ b/paddle/cuda/src/hl_cuda_sparse.cuh @@ -908,24 +908,6 @@ int findIndex(int* indice, int num, int index) { return (end - 1); } -/** - * @brief sum reduction - * - * @param[in,out] smem input data, better to use __shared__ memory. - * @param[in] tid local thread index. - * @param[in] blockDimX the size of blockDim.x. - * - * note: return smem[0]: the sum of each elements of smem. - */ -__device__ __forceinline__ -void reduce(real* smem, int tid, int blockDimX) { - for (unsigned int s = blockDimX / 2; s > 0; s >>= 1) { - if (tid < s) { - smem[tid] += smem[tid + s]; - } - __syncthreads(); - } -} /** * @brief sum columns of csr sparse matrix (csr_val), then add to a_val. 
diff --git a/paddle/gserver/layers/ConcatenateLayer.cpp b/paddle/gserver/layers/ConcatenateLayer.cpp index 52a7cb6f77..bb6709b8df 100644 --- a/paddle/gserver/layers/ConcatenateLayer.cpp +++ b/paddle/gserver/layers/ConcatenateLayer.cpp @@ -97,7 +97,8 @@ void ConcatenateLayer::backward(const UpdateCallback& callback) { */ class ConcatenateLayer2 : public Layer { public: - explicit ConcatenateLayer2(const LayerConfig& config) : Layer(config) {} + explicit ConcatenateLayer2(const LayerConfig& config) : + Layer(config) {} ~ConcatenateLayer2() {} @@ -110,6 +111,8 @@ protected: std::vector> projections_; std::vector projOutput_; std::vector> projCol_; + bool sharedBias_; + std::unique_ptr biases_; }; REGISTER_LAYER(concat2, ConcatenateLayer2); @@ -119,7 +122,6 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap, /* Initialize the basic parent class */ if (!Layer::init(layerMap, parameterMap)) return false; - CHECK(!biasParameter_); CHECK_EQ(inputLayers_.size(), parameters_.size()); projections_.reserve(inputLayers_.size()); projCol_.reserve(inputLayers_.size()); @@ -137,6 +139,13 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap, } CHECK_EQ(getSize(), endCol); + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + sharedBias_ = config_.shared_biases(); + size_t psize = config_.bias_size(); + biases_ = std::unique_ptr(new Weight(1, psize, biasParameter_)); + } + return true; } @@ -154,8 +163,17 @@ void ConcatenateLayer2::forward(PassType passType) { projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol); } - for (size_t i = 0; i != inputLayers_.size(); ++i) { - projections_[i]->forward(&getInput(i), &projOutput_[i], passType); + { + AsyncGpuBlock block; + for (size_t i = 0; i != inputLayers_.size(); ++i) { + projections_[i]->forward(&getInput(i), &projOutput_[i], passType); + } + } + + /* add the bias-vector */ + if (biases_) { + REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); + output_.value->addBias(*(biases_->getW()), 1, sharedBias_); } /* activation */ { @@ -170,6 +188,13 @@ void ConcatenateLayer2::backward(const UpdateCallback& callback) { backwardActivation(); } + AsyncGpuBlock block; + if (biases_ && biases_->getWGrad()) { + REGISTER_TIMER_INFO("Concat2BpBiasTimer", getName().c_str()); + biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_); + biases_->getParameterPtr()->incUpdate(callback); + } + for (size_t i = 0; i != inputLayers_.size(); ++i) { if (projections_[i]) { projections_[i]->backward(callback); diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp index 9ed9572139..040510b7ad 100644 --- a/paddle/gserver/layers/ConvBaseLayer.cpp +++ b/paddle/gserver/layers/ConvBaseLayer.cpp @@ -35,25 +35,12 @@ bool ConvBaseLayer::init(const LayerMap& layerMap, filterSizeY_.push_back(conf.filter_size_y()); filterPixels_.push_back(filterSize_.back() * filterSizeY_.back()); channels_.push_back(conf.channels()); - imgSize_.push_back(conf.img_size()); - imgPixels_.push_back(imgSize_.back() * imgSize_.back()); + imgSizeH_.push_back(conf.img_size()); + imgSizeW_.push_back(conf.img_size()); groups_.push_back(conf.groups()); filterChannels_.push_back(conf.filter_channels()); - outputX_.push_back(conf.output_x()); - outputs_.push_back(outputX_.back() * outputX_.back()); - } - - /* initialize the weightList */ - CHECK(inputLayers_.size() == parameters_.size()); - for (size_t i = 0; i < inputLayers_.size(); i++) { - size_t height, width; - height = filterPixels_[i] * filterChannels_[i]; - width = numFilters_; - 
- // create a new weight - CHECK_EQ(parameters_[i]->getSize(), width * height); - Weight* w = new Weight(height, width, parameters_[i]); - weights_.emplace_back(w); + outputH_.push_back(conf.output_x()); + outputW_.push_back(conf.output_x()); } /* initialize the biases_ */ @@ -74,4 +61,34 @@ bool ConvBaseLayer::init(const LayerMap& layerMap, return true; } +size_t ConvBaseLayer::calOutputSize() { + auto clearAndReserve = [this](IntV* vec) { + vec->clear(); + vec->reserve(this->inputLayers_.size()); + }; + clearAndReserve(&imgSizeH_); + clearAndReserve(&imgSizeW_); + clearAndReserve(&outputH_); + clearAndReserve(&outputW_); + size_t layerSize = 0; + for (size_t i = 0; i < inputLayers_.size(); i++) { + imgSizeH_.push_back(inputLayers_[i]->getOutput().getFrameHeight()); + imgSizeW_.push_back(inputLayers_[i]->getOutput().getFrameWidth()); + if (imgSizeH_[i] == 0) + imgSizeH_[i] = config_.inputs(i).conv_conf().img_size(); + if (imgSizeW_[i] == 0) + imgSizeW_[i] = config_.inputs(i).conv_conf().img_size(); + outputH_.push_back( + outputSize(imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i])); + outputW_.push_back( + outputSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i])); + CHECK_EQ(outputH_[i], outputH_[0]); + CHECK_EQ(outputW_[i], outputW_[0]); + } + getOutput().setFrameHeight(outputH_[0]); + getOutput().setFrameWidth(outputW_[0]); + layerSize = outputH_[0] * outputW_[0] * size_t(numFilters_); + return layerSize; +} + } // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h index eaeaebf43b..316514acf1 100644 --- a/paddle/gserver/layers/ConvBaseLayer.h +++ b/paddle/gserver/layers/ConvBaseLayer.h @@ -43,19 +43,18 @@ protected: IntV filterSizeY_; /// The spatial dimensions of the convolution input. IntV channels_; - /// The spatial dimensions of input feature map. - IntV imgSize_; - /// The total pixel size of input feature map. - /// imgPixels_ = imgSizeX_ * imgSizeY_. - IntV imgPixels_; + /// The spatial dimensions of input feature map height. + IntV imgSizeH_; + /// The spatial dimensions of input feature map width. + IntV imgSizeW_; /// filterPixels_ = filterSizeX_ * filterSizeY_. IntV filterPixels_; /// filterChannels_ = channels_/groups_. IntV filterChannels_; - /// The spatial dimensions of output feature map. - IntV outputX_; - /// The spatial dimensions of output feature map. - IntV outputs_; + /// The spatial dimensions of output feature map height. + IntV outputH_; + /// The spatial dimensions of output feature map width. + IntV outputW_; /// Group size, refer to grouped convolution in /// Alex Krizhevsky's paper: when group=2, the first half of the /// filters are only connected to the first half of the input channels, @@ -80,6 +79,13 @@ public: virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + /** + * imgSizeH_ and imgSizeW_ will be set according to the previous input layers + * in this function. Then it will calculate outputH_ and outputW_ and set them + * into output argument. + */ + virtual size_t calOutputSize(); + Weight& getWeight(int idx) { return *weights_[idx]; } /** diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp new file mode 100644 index 0000000000..d1ce53fe26 --- /dev/null +++ b/paddle/gserver/layers/ConvProjection.cpp @@ -0,0 +1,210 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + + +#include "paddle/utils/Stat.h" +#include "ConvProjection.h" + +namespace paddle { + +REGISTER_PROJECTION(conv, ConvProjection); + +ThreadLocalD> ConvProjection::convMem_; + +ConvProjection::ConvProjection(const ProjectionConfig& config, + ParameterPtr parameter, bool useGpu) + : Projection(config, parameter, useGpu) { + + CHECK(useGpu); // only support GPU + getConvParams(); + initCudnn(); + + size_t height = filterH_ * filterW_ * channels_ / groups_; + size_t width = numFilters_; + weight_.reset(new Weight(height, width, parameter)); + weightOffset_ = height * width / groups_; +} + +void ConvProjection::getConvParams() { + const ConvConfig &conf = config_.conv_conf(); + paddingH_ = conf.padding_y(); + paddingW_ = conf.padding(); + + strideH_ = conf.stride_y(); + strideW_ = conf.stride(); + + filterH_ = conf.filter_size_y(); + filterW_ = conf.filter_size(); + + configImgH_ = conf.img_size(); + configImgW_ = conf.img_size(); + + channels_ = conf.channels(); + numFilters_ = config_.num_filters(); + + groups_ = conf.groups(); + CHECK_EQ(channels_ % groups_, 0); + CHECK_EQ(numFilters_ % groups_, 0); +} + +void ConvProjection::initCudnn() { + hl_create_filter_descriptor(&filterDesc_, channels_, numFilters_, + filterH_, filterW_); + hl_create_tensor_descriptor(&inputDesc_); + hl_create_tensor_descriptor(&outputDesc_); + hl_create_convolution_descriptor(&convDesc_, inputDesc_, filterDesc_, + paddingH_, paddingW_, strideH_, strideW_); + + // initialize all to default algorithms + fwdAlgo_ = 0; + bwdFilterAlgo_ = 0; + bwdDataAlgo_ = 0; + fwdLimitBytes_ = 0; + bwdDataLimitBytes_ = 0; + bwdFilterLimitBytes_ = 0; + workSpaceInBytes_ = 0; + + batchNum_ = 0; + isSelectAlgo_ = false; +} + +void ConvProjection::reshapeTensorDesc(int batchSize) { + hl_tensor_reshape(inputDesc_, batchSize, channels_, imageH_, imageW_, + channels_ * imageH_ * imageW_, imageH_ * imageW_, + imageW_, 1); + hl_reset_convolution_descriptor(convDesc_, inputDesc_, filterDesc_, + paddingH_, paddingW_, strideH_, strideW_); + + // The stride between two consecutive images in ConvProjection may not be 1, + // for example, in the case of layer ConcatenateLayer2 with two + // ConvProjection, the stride is the output_size of layer ConcatenateLayer2. + // So the calculation of nStride is different from CudnnConvLayer. + // In fact, only "nStride = out_->value->getStride()" is ok. 
+ size_t nStride = numFilters_ * outputH_ * outputW_; + if (out_->value->isContiguous()) { + CHECK_EQ(nStride, out_->value->getWidth()); + } else { + nStride = out_->value->getStride(); + } + + hl_tensor_reshape(outputDesc_, batchSize, numFilters_, outputH_, outputW_, + nStride, outputH_ * outputW_, outputW_, 1); +} + +void ConvProjection::reshape(int batchSize) { + size_t width = calOutputSize(); + CHECK_EQ(width, out_->value->getWidth()); + + isSelectAlgo_ = (batchSize == batchNum_); + batchNum_ = batchSize; + + if (!isSelectAlgo_) { + reshapeTensorDesc(batchSize); + hl_conv_workspace(inputDesc_, outputDesc_, filterDesc_, + convDesc_, &fwdAlgo_, &fwdLimitBytes_, + &bwdDataAlgo_, &bwdDataLimitBytes_, + &bwdFilterAlgo_, &bwdFilterLimitBytes_); + + size_t maxWorkSpace = 0; + maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); + maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); + workSpaceInBytes_ = maxWorkSpace; + + + VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_ + << " / " << bwdDataAlgo_ + << " / " << bwdFilterAlgo_; + } + + isSelectAlgo_ = true; +} + +void ConvProjection::forward() { + int batchSize = in_->value->getHeight(); + reshape(batchSize); + + void* workSpace = NULL; + if (workSpaceInBytes_ > 0) { + workSpace = getSpaceBytes(workSpaceInBytes_); + } + + for (int g = 0; g < groups_; ++g) { + REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str()); + + real *inputData = in_->value->getData() + g * inputOffset_; + real *wgtData = weight_->getW()->getData() + g * weightOffset_; + real *outData = out_->value->getData() + g * outputOffset_; + hl_convolution_forward(inputDesc_, inputData, outputDesc_, + outData, filterDesc_, wgtData, + convDesc_, workSpace, + fwdLimitBytes_, fwdAlgo_); + } +} + +void ConvProjection::backward(const UpdateCallback& callback) { + REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str()); + + void* workSpace = NULL; + if (workSpaceInBytes_ > 0) { + workSpace = getSpaceBytes(workSpaceInBytes_); + } + + for (int g = 0; g < groups_; ++g) { + real *outGrad = out_->grad->getData() + g * outputOffset_; + if (weight_->getWGrad()) { + real *inputData = in_->value->getData() + g * inputOffset_; + real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_; + hl_convolution_backward_filter( + inputDesc_, inputData, outputDesc_, outGrad, filterDesc_, + weightGrad, convDesc_, workSpace, bwdFilterLimitBytes_, + bwdFilterAlgo_); + } + + MatrixPtr preGrad = in_->grad; + if (NULL != preGrad) { + real *inputGrad = preGrad->getData() + g * inputOffset_; + real *wgtData = weight_->getW()->getData() + g* weightOffset_; + hl_convolution_backward_data( + inputDesc_, inputGrad, outputDesc_, outGrad, filterDesc_, + wgtData, convDesc_, workSpace, bwdDataLimitBytes_, + bwdDataAlgo_); + } + } + + weight_->getParameterPtr()->incUpdate(callback); +} + +void* ConvProjection::getSpaceBytes(size_t size) { + std::vector& convMem = *convMem_; + if (convMem.empty()) { + int numDevices = hl_get_device_count(); + convMem.resize(numDevices); + } + + int devId = hl_get_device(); + MemoryHandle** localMem = &(convMem[devId]); + if (NULL == *localMem || size > (*localMem)->getAllocSize()) { + *localMem = new GpuMemoryHandle(size); + } + return (*localMem)->getBuf(); +} + +ConvProjection::~ConvProjection() { + hl_destroy_tensor_descriptor(inputDesc_); + hl_destroy_tensor_descriptor(outputDesc_); + hl_destroy_filter_descriptor(filterDesc_); + hl_destroy_convolution_descriptor(convDesc_); +} + +} // namespace paddle diff --git 
a/paddle/gserver/layers/ConvProjection.h b/paddle/gserver/layers/ConvProjection.h new file mode 100644 index 0000000000..41a100ac3c --- /dev/null +++ b/paddle/gserver/layers/ConvProjection.h @@ -0,0 +1,125 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + + +#pragma once + +#include "Projection.h" + +namespace paddle { + +/** + * @brief Convolution projection do the same calculation with CudnnConvLayer. + */ +class ConvProjection : public Projection { +public: + /** + * Constructor. + */ + ConvProjection(const ProjectionConfig& config, ParameterPtr parameter, + bool useGpu); + + ~ConvProjection(); + + virtual void forward(); + virtual void backward(const UpdateCallback& callback); + +protected: + void getConvParams(); + void initCudnn(); + + void reshapeTensorDesc(int batchSize); + void reshape(int batchSize); + + int outputSize(int imageSize, int filterSize, int padding, int stride) { + return (imageSize - filterSize + 2 * padding) / stride + 1; + } + + size_t calOutputSize() { + imageH_ = in_->getFrameHeight(); + imageW_ = in_->getFrameWidth(); + if (imageH_ == 0) imageH_ = configImgH_; + if (imageW_ == 0) imageW_ = configImgW_; + outputH_ = outputSize(imageH_, filterH_, paddingH_, strideH_); + outputW_ = outputSize(imageW_, filterW_, paddingW_, strideW_); + + const_cast(out_)->setFrameHeight(outputH_); + const_cast(out_)->setFrameWidth(outputW_); + + inputOffset_ = (channels_ / groups_) * imageH_ * imageW_; + outputOffset_ = (numFilters_ / groups_) * outputH_ * outputW_; + return outputH_ * outputW_ * numFilters_; + } + + static void* getSpaceBytes(size_t size); + + /// imageH_ and imageW_ is calculated from the input layer. + int imageH_, imageW_; + /// configImgH_ and configImgW_ is obtained from config. + int configImgH_, configImgW_; + int outputH_, outputW_; + int channels_, numFilters_; + int paddingH_, paddingW_; + int strideH_, strideW_; + int filterH_, filterW_; + /// One group offset of input data. + int inputOffset_; + /// One group offset of output data. + int outputOffset_; + /// One group offset of weight. + int weightOffset_; + int groups_; + + /// Cudnn tensor descriptor for input. + hl_tensor_descriptor inputDesc_; + /// Cudnn tensor descriptor for output. + hl_tensor_descriptor outputDesc_; + /// Cudnn tensor descriptor for filter. + hl_filter_descriptor filterDesc_; + /// Cudnn tensor descriptor for a convolution operation. + hl_convolution_descriptor convDesc_; + + /// Record the algorithm for forward convolution, which is obtained by cudnn + /// api to search the best suited algorithm. + int fwdAlgo_; + /// Record the algorithm for computing convolution gradient with respect to + /// filter coefficients. + int bwdFilterAlgo_; + /// Record the algorithm for computing convolution gradient with respect to + /// the output. + int bwdDataAlgo_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// forward convolution with the specified algo. 
+ size_t fwdLimitBytes_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// backwardFilter with the specified algo. + size_t bwdDataLimitBytes_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// backwardData with the specified algo. + size_t bwdFilterLimitBytes_; + /// Size of total work space. + size_t workSpaceInBytes_; + + /// Whether to call cuDNN api to choose conv algorithm. + bool isSelectAlgo_; + /// batchNum is used to record batch size. If the batch size is changed, + /// the selection algorithm will be called. + int batchNum_; + bool bias_; + + std::unique_ptr weight_; + static ThreadLocalD> convMem_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/CudnnConvLayer.cpp b/paddle/gserver/layers/CudnnConvLayer.cpp index 0f932f960f..e77216f17c 100644 --- a/paddle/gserver/layers/CudnnConvLayer.cpp +++ b/paddle/gserver/layers/CudnnConvLayer.cpp @@ -22,215 +22,64 @@ REGISTER_LAYER(cudnn_conv, CudnnConvLayer); bool CudnnConvLayer::init(const LayerMap &layerMap, const ParameterMap ¶meterMap) { - ConvBaseLayer::init(layerMap, parameterMap); + if (!ConvBaseLayer::init(layerMap, parameterMap)) return false; CHECK(useGpu_) << "CudnnConvLayer only support gpu"; - maxGroups_ = 0; - for (size_t i = 0; i < inputLayers_.size(); i++) { - CHECK_EQ(channels_[i] % groups_[i], 0); - CHECK_EQ(numFilters_ % groups_[i], 0); - - hl_filter_descriptor filter; - hl_create_filter_descriptor(&filter, channels_[i] / groups_[i], - numFilters_ / groups_[i], filterSizeY_[i], - filterSize_[i]); - filterDesc_.push_back(filter); - - hl_tensor_descriptor input; - hl_create_tensor_descriptor(&input); - inputDesc_.push_back(input); - - hl_tensor_descriptor output; - int outputX = - outputSize(imgSize_[i], filterSize_[i], padding_[i], stride_[i]); - CHECK_EQ(outputX, outputX_[i]); - hl_create_tensor_descriptor(&output); - outputDesc_.push_back(output); + CHECK_EQ(inputLayers_.size(), parameters_.size()); + projections_.reserve(inputLayers_.size()); + projConf_.reserve(inputLayers_.size()); - hl_convolution_descriptor conv; - hl_create_convolution_descriptor(&conv, input, filter, paddingY_[i], - padding_[i], strideY_[i], stride_[i]); - convDesc_.push_back(conv); - - weightOffset_.push_back((numFilters_ / groups_[i]) * - (channels_[i] / groups_[i]) * filterPixels_[i]); - inputOffset_.push_back((channels_[i] / groups_[i]) * imgSize_[i] * - imgSize_[i]); - outputOffset_.push_back((numFilters_ / groups_[i]) * outputX_[i] * - outputX_[i]); - - // initialize all to default algorithms - fwdAlgo_.push_back(0); - bwdFilterAlgo_.push_back(0); - bwdDataAlgo_.push_back(0); - fwdLimitBytes_.push_back(0); - bwdFilterLimitBytes_.push_back(0); - bwdDataLimitBytes_.push_back(0); - - // cudnn streams per group equal to 1 - if (groups_[i] > maxGroups_) { - maxGroups_ = groups_[i]; - } - } - - workSpaceInBytes_ = 0; - workSpaceData_ = NULL; - for (int i = 0; i < maxGroups_; ++i) { - workSpace_.push_back(NULL); + numFilters_ = config_.num_filters(); + CHECK(config_.shared_biases()); + for (size_t i = 0; i < inputLayers_.size(); i++) { + ProjectionConfig* conf = new ProjectionConfig(); + conf->set_type("conv"); + conf->set_num_filters(numFilters_); + conf->set_allocated_conv_conf( + config_.mutable_inputs(i)->mutable_conv_conf()); + conf->set_input_size(getPrev(i)->getSize()); + conf->set_output_size(getSize()); + projConf_.emplace_back(conf); + projections_.emplace_back(Projection::create(*projConf_[i], + parameters_[i], useGpu_)); } if (biases_.get() && sharedBiases_) { 
hl_create_tensor_descriptor(&biasDesc_); + hl_create_tensor_descriptor(&outputDesc_); hl_tensor_reshape(biasDesc_, 1, numFilters_ / groups_[0], 1, 1); biasOffset_ = numFilters_ / groups_[0]; } - batchNum_ = 0; - isSelectAlgo_ = false; return true; } -void CudnnConvLayer::allocConvWorkSpace(size_t maxWorkSpace) { - size_t totalWorkSpace = maxWorkSpace * maxGroups_; - - if (totalWorkSpace > workSpaceInBytes_) { - if (workSpaceInBytes_ != 0) { - hl_free_mem_device(workSpaceData_); - } - // total amount of storage needed over all groups - workSpaceData_ = hl_malloc_device(totalWorkSpace); - - // update work space address for each group - for (int i = 0; i < maxGroups_; ++i) { - workSpace_[i] = reinterpret_cast(workSpaceData_) - + i * maxWorkSpace; - } - workSpaceInBytes_ = totalWorkSpace; - } -} - -void CudnnConvLayer::reshape(int batchSize) { - CHECK_NE(inputLayers_.size(), 0UL); - imageH_ = inputLayers_[0]->getOutput().getFrameHeight(); - imageW_ = inputLayers_[0]->getOutput().getFrameWidth(); - if (imageH_ == 0) imageH_ = imgSize_[0]; - if (imageW_ == 0) imageW_ = imgSize_[0]; - - for (size_t i = 1; i < inputLayers_.size(); i++) { - int imageH = inputLayers_[i]->getOutput().getFrameHeight(); - int imageW = inputLayers_[i]->getOutput().getFrameWidth(); - if (imageH) { - CHECK_EQ(imageH_, imageH) << "Inputs must have same height."; - } - if (imageW) { - CHECK_EQ(imageW_, imageW) << "Inputs must have same width."; - } - } - - outputH_ = outputSize(imageH_, filterSizeY_[0], paddingY_[0], strideY_[0]); - outputW_ = outputSize(imageW_, filterSize_[0], padding_[0], stride_[0]); - // check outputH & outputW - getOutput().setFrameHeight(outputH_); - getOutput().setFrameWidth(outputW_); - - // if the batchSize remains the same, set isSelectAlgo_ true. - // Otherwise, set isSelectAlgo_ false and select algo again. 
- isSelectAlgo_ = (batchSize == batchNum_); - batchNum_ = batchSize; - - size_t maxWorkSpace = 0; - for (size_t i = 0; i < inputLayers_.size(); i++) { - CHECK_EQ(inputLayers_[i]->getOutput().value->getWidth(), - (size_t)(channels_[i] * imageH_ * imageW_)); - - hl_tensor_reshape(inputDesc_[i], batchSize, channels_[i] / groups_[i], - imageH_, imageW_, channels_[i] * imageH_ * imageW_, - imageH_ * imageW_, imageW_, 1); - - hl_tensor_reshape(outputDesc_[i], batchSize, numFilters_ / groups_[i], - outputH_, outputW_, numFilters_ * outputH_ * outputW_, - outputH_ * outputW_, outputW_, 1); - - hl_reset_convolution_descriptor(convDesc_[i], inputDesc_[i], - filterDesc_[i], paddingY_[i], - padding_[i], strideY_[i], stride_[i]); - - inputOffset_[i] = (channels_[i] / groups_[i]) * imageH_ * imageW_; - outputOffset_[i] = (numFilters_ / groups_[i]) * outputH_ * outputW_; - - if (!isSelectAlgo_) { - hl_conv_workspace(inputDesc_[i], outputDesc_[i], filterDesc_[i], - convDesc_[i], &fwdAlgo_[i], &fwdLimitBytes_[i], - &bwdDataAlgo_[i], &bwdDataLimitBytes_[i], - &bwdFilterAlgo_[i], &bwdFilterLimitBytes_[i]); - - maxWorkSpace = std::max(fwdLimitBytes_[i], bwdDataLimitBytes_[i]); - maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_[i]); - - VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_[i] - << " / " << bwdDataAlgo_[i] - << " / " << bwdFilterAlgo_[i]; - } - } - - if (!isSelectAlgo_) { - allocConvWorkSpace(maxWorkSpace); - } - - isSelectAlgo_ = true; -} - void CudnnConvLayer::forward(PassType passType) { Layer::forward(passType); - int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - reshape(batchSize); - resetOutput(batchSize, outputH_ * outputW_ * numFilters_); + + int batchSize = getInput(0).getBatchSize(); + resetOutput(batchSize, calOutputSize()); for (size_t i = 0; i != inputLayers_.size(); ++i) { - REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str()); - for (int g = 0; g < groups_[i]; ++g) { - real *inputData = getInputValue(i)->getData() + inputOffset_[i] * g; - real *wgtData = weights_[i]->getW()->getData() + weightOffset_[i] * g; - real *outData = getOutputValue()->getData() + outputOffset_[i] * g; - hl_convolution_forward(inputDesc_[i], inputData, outputDesc_[i], - outData, filterDesc_[i], wgtData, - convDesc_[i], workSpace_[g], - fwdLimitBytes_[i], fwdAlgo_[i]); - } + projections_[i]->forward(&getInput(i), &getOutput(), passType); } if (biases_) { REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str()); - addBiases(); - } - - forwardActivation(); -} - -void CudnnConvLayer::addBiases() { - if (sharedBiases_) { + int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + hl_tensor_reshape(outputDesc_, batchSize, numFilters_ / groups_[0], + outputH_[0], outputW_[0], numFilters_ * outputH_[0] * outputW_[0], + outputH_[0] * outputW_[0], outputW_[0], 1); + outputOffset_ = getOutputValue()->getWidth() / groups_[0]; for (int g = 0; g < groups_[0]; ++g) { real *biasData = biases_->getW()->getData() + biasOffset_ * g; - real *outData = getOutputValue()->getData() + outputOffset_[0] * g; + real *outData = getOutputValue()->getData() + outputOffset_ * g; hl_convolution_forward_add_bias(biasDesc_, biasData, - outputDesc_[0], outData); + outputDesc_, outData); } - } else { - LOG(FATAL) << "Not supported"; } -} -void CudnnConvLayer::bpropBiases() { - if (sharedBiases_) { - for (int g = 0; g < groups_[0]; ++g) { - real *biasGrad = biases_->getWGrad()->getData() + biasOffset_ * g; - real *outGrad = getOutputGrad()->getData() + outputOffset_[0] * g; - 
hl_convolution_backward_bias(biasDesc_, biasGrad, - outputDesc_[0], outGrad); - } - } else { - LOG(FATAL) << "Not supported"; - } + forwardActivation(); } void CudnnConvLayer::backward(const UpdateCallback &callback) { @@ -238,52 +87,23 @@ void CudnnConvLayer::backward(const UpdateCallback &callback) { if (biases_ && biases_->getWGrad()) { REGISTER_TIMER_INFO("CudnnConvBpBiasTimer", getName().c_str()); - bpropBiases(); + for (int g = 0; g < groups_[0]; ++g) { + real *biasGrad = biases_->getWGrad()->getData() + biasOffset_ * g; + real *outGrad = getOutputGrad()->getData() + outputOffset_ * g; + hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad); + } biases_->getParameterPtr()->incUpdate(callback); } for (size_t i = 0; i != inputLayers_.size(); ++i) { - REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str()); - for (int g = 0; g < groups_[i]; ++g) { - real *outGrad = getOutputGrad()->getData() + outputOffset_[i] * g; - if (weights_[i]->getWGrad()) { - real *inputData = getInputValue(i)->getData() + inputOffset_[i] * g; - real *weightGrad = - weights_[i]->getWGrad()->getData() + weightOffset_[i] * g; - hl_convolution_backward_filter( - inputDesc_[i], inputData, outputDesc_[i], outGrad, filterDesc_[i], - weightGrad, convDesc_[i], workSpace_[g], bwdFilterLimitBytes_[i], - bwdFilterAlgo_[i]); - } - - MatrixPtr preGrad = getInputGrad(i); - if (NULL != preGrad) { - real *inputGrad = preGrad->getData() + inputOffset_[i] * g; - real *wgtData = weights_[i]->getW()->getData() + weightOffset_[i] * g; - hl_convolution_backward_data( - inputDesc_[i], inputGrad, outputDesc_[i], outGrad, filterDesc_[i], - wgtData, convDesc_[i], workSpace_[g], bwdDataLimitBytes_[i], - bwdDataAlgo_[i]); - } - } - weights_[i]->getParameterPtr()->incUpdate(callback); + projections_[i]->backward(callback); } } CudnnConvLayer::~CudnnConvLayer() { - if (biasDesc_) { + if (biases_) { hl_destroy_tensor_descriptor(biasDesc_); - } - - for (size_t i = 0; i < inputDesc_.size(); i++) { - hl_destroy_tensor_descriptor(inputDesc_[i]); - hl_destroy_tensor_descriptor(outputDesc_[i]); - hl_destroy_filter_descriptor(filterDesc_[i]); - hl_destroy_convolution_descriptor(convDesc_[i]); - } - if (workSpaceInBytes_ != 0) { - hl_free_mem_device(workSpaceData_); - workSpaceInBytes_ = 0; + hl_destroy_tensor_descriptor(outputDesc_); } } diff --git a/paddle/gserver/layers/CudnnConvLayer.h b/paddle/gserver/layers/CudnnConvLayer.h index a6dadba10d..6390d96315 100644 --- a/paddle/gserver/layers/CudnnConvLayer.h +++ b/paddle/gserver/layers/CudnnConvLayer.h @@ -17,12 +17,13 @@ limitations under the License. */ #include "ConvBaseLayer.h" #include "paddle/math/Matrix.h" +#include "Projection.h" #include namespace paddle { /** - * @brief A subclass of ConvBaseLayer by cuDNN implementation. It only + * @brief A 2-dimension conv layer implemented by cuDNN. It only * supports GPU mode. We automatic select CudnnConvLayer for GPU * mode and ExpandConvLayer for CPU mode if you set type of "conv". * User also can specfiy type of "exconv" or "cudnn_conv" for @@ -31,81 +32,21 @@ namespace paddle { * The config file api is img_conv_layer. */ class CudnnConvLayer : public ConvBaseLayer { -private: - /// resize Cudnn workspace size - void allocConvWorkSpace(size_t maxWorkSpace); - protected: - int imageH_, imageW_, outputH_, outputW_; - /// Cudnn tensor descriptor for bias. + std::vector> projConf_; + std::vector> projections_; + hl_tensor_descriptor biasDesc_; - /// Cudnn tensor descriptor for input. 
- std::vector inputDesc_; - /// Cudnn tensor descriptor for output. - std::vector outputDesc_; - /// Cudnn tensor descriptor for filter. - std::vector filterDesc_; - /// Cudnn tensor descriptor for a convolution operation. - std::vector convDesc_; - /// One sample offset of input data. - IntV inputOffset_; - /// One sample offset of output data. - IntV outputOffset_; - /// One group offset of weight. - IntV weightOffset_; - /// One group offset of bias. + hl_tensor_descriptor outputDesc_; int biasOffset_; - - /// Save the algorithm for forward convolution, which is obtained by cudnn - /// api to search the best suited algorithm. - std::vector fwdAlgo_; - /// Save the algorithm for computing convolution gradient with respect to - /// filter coefficients. - std::vector bwdFilterAlgo_; - /// Save the algorithm for computing convolution gradient with respect to - /// the output. - std::vector bwdDataAlgo_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// forward convolution with the specified algo. - std::vector fwdLimitBytes_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// backwardFilter with the specified algo. - std::vector bwdFilterLimitBytes_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// backwardData with the specified algo. - std::vector bwdDataLimitBytes_; - - /// Device work space address for each group. - std::vector workSpace_; - /// Max number of groups. - int maxGroups_; - /// Total work space address in device for all groups. - void* workSpaceData_; - /// Size of total work space. - size_t workSpaceInBytes_; - - /// Is or not select conv algorihtm. - bool isSelectAlgo_; - - /// batchNum is used to record batch size. If the batch size is changed, - /// the selection algorithm will be called. - int batchNum_; + int outputOffset_; public: explicit CudnnConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {} ~CudnnConvLayer(); - /** - * Intialization. Initialize member variables and create tenor descriptor. - */ bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - /** - * Reshape is done each forward. Reshape tensor decriptor - * inputDesc_, outputDesc_, convDesc_. And search the faster algo - * or the fastest algo within a given memeory limit. 
- */ - void reshape(int batchSize); void forward(PassType passType); void backward(const UpdateCallback& callback); void addBiases(); diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp index df79c3e303..80a6a62b5c 100644 --- a/paddle/gserver/layers/ExpandConvLayer.cpp +++ b/paddle/gserver/layers/ExpandConvLayer.cpp @@ -37,32 +37,29 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, caffeMode_ = conf.caffe_mode(); } + /* initialize the weightList */ + CHECK(inputLayers_.size() == parameters_.size()); + for (size_t i = 0; i < inputLayers_.size(); i++) { + size_t height, width; + height = filterPixels_[i] * filterChannels_[i]; + width = numFilters_; + + // create a new weight + CHECK_EQ(parameters_[i]->getSize(), width * height); + Weight* w = new Weight(height, width, parameters_[i]); + weights_.emplace_back(w); + } + return true; } -size_t ExpandConvLayer::getSize() { +size_t ExpandConvLayer::getOutputSize() { CHECK_NE(inputLayers_.size(), 0UL); - imgSizeH_.clear(); - imgSizeW_.clear(); - outputH_.clear(); - outputW_.clear(); + size_t layerSize = ConvBaseLayer::calOutputSize(); subN_.clear(); - size_t layerSize = 0; for (size_t i = 0; i < inputLayers_.size(); i++) { - imgSizeH_.push_back(inputLayers_[i]->getOutput().getFrameHeight()); - imgSizeW_.push_back(inputLayers_[i]->getOutput().getFrameWidth()); - if (imgSizeH_[i] == 0) imgSizeH_[i] = imgSize_[i]; - if (imgSizeW_[i] == 0) imgSizeW_[i] = imgSize_[i]; - outputH_.push_back( - outputSize(imgSizeH_[i], filterSize_[i], padding_[i], stride_[i])); - outputW_.push_back( - outputSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i])); subN_.push_back(outputH_[i] * outputW_[i]); - CHECK(layerSize == 0 || subN_[i] * size_t(numFilters_) == layerSize); - layerSize = subN_[i] * numFilters_; } - getOutput().setFrameHeight(outputH_[0]); - getOutput().setFrameWidth(outputW_[0]); return layerSize; } @@ -119,7 +116,7 @@ void ExpandConvLayer::expandFwdOnce(MatrixPtr image, int inIdx, int startIdx) { } void ExpandConvLayer::addSharedBias() { - size_t mapW = getSize() / numFilters_; + size_t mapW = getOutputValue()->getWidth() / numFilters_; size_t mapH = getOutputValue()->getElementCnt() / mapW; MatrixPtr out = Matrix::create(getOutputValue()->getData(), mapH, mapW, false, useGpu_); @@ -158,7 +155,7 @@ void ExpandConvLayer::forward(PassType passType) { * transOutValue correspond sample to one row */ int batchSize = inputLayers_[0]->getOutputValue()->getWidth(); batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - resetOutput(batchSize, getSize()); + resetOutput(batchSize, getOutputSize()); MatrixPtr image = nullptr; for (size_t i = 0; i != inputLayers_.size(); ++i) { @@ -183,7 +180,7 @@ void ExpandConvLayer::forward(PassType passType) { } void ExpandConvLayer::bpropSharedBias(MatrixPtr biases, MatrixPtr v) { - size_t mapW = getSize() / numFilters_; + size_t mapW = v->getWidth() / numFilters_; size_t mapH = v->getElementCnt() / mapW; MatrixPtr vTmp = Matrix::create(v->getData(), mapH, mapW, false, useGpu_); diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h index fc3d69b1b7..030a3ba397 100644 --- a/paddle/gserver/layers/ExpandConvLayer.h +++ b/paddle/gserver/layers/ExpandConvLayer.h @@ -37,14 +37,6 @@ protected: IntV subN_; /// subK_ = channels_ * filterPixels_ * groups_. IntV subK_; - /// The spatial dimensions of height of input feature map. - IntV imgSizeH_; - /// The spatial dimensions of width of input feature map. 
- IntV imgSizeW_; - /// The spatial dimensions of height of output feature map. - IntV outputH_; - /// The spatial dimensions of width of output feature map. - IntV outputW_; /// Expand one sample at a time. shape: /// (numChannels * filterPixels_, outputSizeH * outputSizeW) MatrixPtr expandInput_; @@ -58,7 +50,7 @@ public: bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - size_t getSize(); + size_t getOutputSize(); /** * Create or resize expandInput_. diff --git a/paddle/gserver/layers/MixedLayer.cpp b/paddle/gserver/layers/MixedLayer.cpp index 054ddd3a22..26b1360290 100644 --- a/paddle/gserver/layers/MixedLayer.cpp +++ b/paddle/gserver/layers/MixedLayer.cpp @@ -41,9 +41,13 @@ bool MixedLayer::init(const LayerMap& layerMap, } operators_.emplace_back(Operator::create(operator_conf, useGpu_)); } + /* initialize biases_ */ if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + sharedBias_ = config_.shared_biases(); + size_t psize = config_.bias_size(); + biases_ = std::unique_ptr( + new Weight(1, psize, biasParameter_)); } return true; @@ -119,12 +123,6 @@ void MixedLayer::forward(PassType passType) { MatrixPtr outV = getOutputValue(); - /* add the bias-vector */ - if (biases_.get() != NULL) { - REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); - outV->addBias(*(biases_->getW()), 1); - } - for (size_t i = 0; i != inputLayers_.size(); ++i) { if (projections_[i]) { projections_[i]->forward(&getInput(i), &output_, passType); @@ -140,6 +138,12 @@ void MixedLayer::forward(PassType passType) { op->forward(ins, &output_, passType); } + /* add the bias-vector */ + if (biases_.get() != NULL) { + REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); + outV->addBias(*(biases_->getW()), 1, sharedBias_); + } + /* activation */ { REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); forwardActivation(); @@ -154,7 +158,7 @@ void MixedLayer::backward(const UpdateCallback& callback) { if (biases_ && biases_->getWGrad()) { REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str()); - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_); /* Increasing the number of gradient */ biases_->getParameterPtr()->incUpdate(callback); diff --git a/paddle/gserver/layers/MixedLayer.h b/paddle/gserver/layers/MixedLayer.h index 9bac1355bd..5842e51e1d 100644 --- a/paddle/gserver/layers/MixedLayer.h +++ b/paddle/gserver/layers/MixedLayer.h @@ -58,5 +58,6 @@ protected: /// the matrix size of projection state std::vector projectionStateMatrixSize_; std::unique_ptr biases_; + bool sharedBias_; }; } // namespace paddle diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index 552a6c5b41..bc7bee0e4b 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -669,12 +669,14 @@ void testLayerGrad(TestConfig testConf, string testLayerName, size_t batchSize, void testProjectionGrad(ProjectionConfig conf, InputType inputType, size_t parameterSize, size_t batchSize, bool useGpu, - bool testState) { + bool testState, int biasSize, bool sharedBias) { TestConfig config; conf.set_name(conf.type()); config.layerConfig.set_type("mixed"); config.layerConfig.set_size(conf.output_size()); - config.biasSize = config.layerConfig.size(); + config.biasSize = biasSize == 0 ? 
config.layerConfig.size() : biasSize; + config.layerConfig.set_bias_size(config.biasSize); + config.layerConfig.set_shared_biases(sharedBias); config.inputDefs.push_back( {inputType, "layer_0", conf.input_size(), parameterSize}); *config.layerConfig.add_inputs()->mutable_proj_conf() = conf; diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index 1e608dc062..3b9ec80395 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -217,7 +217,8 @@ void testLayerGrad(TestConfig testConf, string testLayerName, size_t batchSize, void testProjectionGrad(ProjectionConfig conf, InputType inputType, size_t parameterSize, size_t batchSize, bool useGpu, - bool testState = false); + bool testState = false, int biasSize = 0, + bool sharedBias = false); void testOperatorGrad(TestConfig& config, OperatorConfig& operatorConf, size_t batchSize, bool useGpu, bool testState = false); diff --git a/paddle/gserver/tests/img_conv_a.conf b/paddle/gserver/tests/img_conv_a.conf new file mode 100644 index 0000000000..940589ed9a --- /dev/null +++ b/paddle/gserver/tests/img_conv_a.conf @@ -0,0 +1,39 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +settings(batch_size=10) +data = data_layer(name ="input", size=8*16*16) +conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=False, + act=ReluActivation()) +conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=False, + act=ReluActivation()) + +concat = concat_layer(input=[conv1, conv2]) + +conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=True, + act=LinearActivation()) + +outputs(concat, conv) diff --git a/paddle/gserver/tests/img_conv_b.conf b/paddle/gserver/tests/img_conv_b.conf new file mode 100644 index 0000000000..8ca9c94541 --- /dev/null +++ b/paddle/gserver/tests/img_conv_b.conf @@ -0,0 +1,32 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +settings(batch_size=10) +data = data_layer(name ="input", size=8*16*16) +proj1 = conv_projection(input=data, filter_size=1, filter_size_y=1, + num_channels=8, num_filters=16, stride=1) +proj2 = conv_projection(input=data, filter_size=1, filter_size_y=1, + num_channels=8, num_filters=16, stride=1) +concat = concat_layer(input=[proj1, proj2], bias_attr=False, act=ReluActivation()) + +proj = conv_projection(input=data, filter_size=1, filter_size_y=1, + num_channels=8, num_filters=16, stride=1) + +with mixed_layer(bias_attr=True, act=LinearActivation()) as conv: + conv += proj + +outputs(concat, conv) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index eab9bf8414..bf2c2e0499 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -134,6 +134,45 @@ TEST(Projection, identity) { } } + +#ifndef PADDLE_ONLY_CPU +TEST(Projection, conv) { + const int NUM_FILTERS = 16; + const int FILTER_SIZE = 2; + const int FILTER_SIZE_Y = 3; + const int CHANNELS = 3; + const int IMAGE_SIZE = 16; + + ProjectionConfig conf; + conf.set_type("conv"); + conf.set_num_filters(NUM_FILTERS); + + ConvConfig* conv = conf.mutable_conv_conf(); + conv->set_filter_size(FILTER_SIZE); + conv->set_filter_size_y(FILTER_SIZE_Y); + conv->set_channels(CHANNELS); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_filter_channels(conv->channels() / conv->groups()); + conv->set_img_size(IMAGE_SIZE); + int outputSize = (2 * conv->padding() + conv->img_size() - + conv->filter_size()) / conv->stride() + 1; + int outputSizeY = (2 * conv->padding_y() + conv->img_size() - + conv->filter_size_y()) / conv->stride_y() + 1; + conv->set_output_x(outputSize); + conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); + conf.set_output_size(outputSize * outputSizeY * NUM_FILTERS); + + testProjectionGrad(conf, INPUT_DATA, + /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * FILTER_SIZE_Y, + /* batchSize */ 100, true, false, NUM_FILTERS, true); +} +#endif + + TEST(Layer, concat) { TestConfig config; config.biasSize = 0; diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp index b3ef530673..8d3eac5aca 100644 --- a/paddle/gserver/tests/test_NetworkCompare.cpp +++ b/paddle/gserver/tests/test_NetworkCompare.cpp @@ -236,6 +236,15 @@ TEST(Compare, img_pool) { compareNetwork(config_file_a, config_file_b); FLAGS_use_gpu = useGpu; } + +TEST(Compare, img_conv) { + std::string config_file_a = "./gserver/tests/img_conv_a.conf"; + std::string config_file_b = "./gserver/tests/img_conv_b.conf"; + bool useGpu = FLAGS_use_gpu; + FLAGS_use_gpu = true; + compareNetwork(config_file_a, config_file_b); + FLAGS_use_gpu = useGpu; +} #endif diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 843eabc97d..aaeae98f0d 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -340,6 +340,15 @@ void GpuMatrix::addBias(Matrix& b, real scale) { BaseMatrix::addBias(b, scale); } +void GpuMatrix::addSharedBias(Matrix& b, real scale) { + CHECK(b.getHeight() == 1) << "the Bias should be a vector"; + CHECK_LE(b.getWidth(), getWidth()); + CHECK_EQ(getWidth() % b.getWidth(), 0UL); + hl_matrix_add_shared_bias(getData(), b.getData(), b.getWidth(), + getHeight(), getWidth(), scale); +} + + void GpuMatrix::collectBias(Matrix& a, real scale) { CHECK_EQ(getHeight(), (size_t)1); CHECK_EQ(width_, 
a.getWidth()); @@ -354,6 +363,14 @@ void GpuMatrix::collectBias(Matrix& a, real scale) { } } +void GpuMatrix::collectSharedBias(Matrix& a, real scale) { + CHECK_EQ(getHeight(), (size_t)1); + CHECK_EQ(a.getWidth() % getWidth(), 0UL); + hl_matrix_collect_shared_bias(getData(), a.getData(), getWidth(), + a.getHeight(), a.getWidth(), scale); +} + + void GpuMatrix::sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode) { @@ -1983,6 +2000,24 @@ void CpuMatrix::addBias(Matrix& b, real scale) { } } +void CpuMatrix::addSharedBias(Matrix& b, real scale) { + CHECK_EQ(b.getHeight(), (size_t)1); + real* aData = getData(); + real* bData = b.getData(); + size_t numSamples = getHeight(); + size_t channel = b.getWidth(); + CHECK_EQ(getWidth() % channel, 0UL); + size_t dim = getWidth() / channel; + + for (size_t i = 0; i < numSamples; i++) { + for (size_t c = 0; c < channel; c++) { + for (size_t j = 0; j < dim; j++) { + aData[i * getStride() + c * dim + j] += scale * bData[c]; + } + } + } +} + void CpuMatrix::collectBias(Matrix& a, real scale) { CHECK_EQ(getHeight(), (size_t)1); CHECK_EQ(width_, a.getWidth()); @@ -2000,6 +2035,23 @@ void CpuMatrix::collectBias(Matrix& a, real scale) { } } +void CpuMatrix::collectSharedBias(Matrix& a, real scale) { + CHECK_EQ(getHeight(), (size_t)1); + real* B = getData(); + real* A = a.getData(); + size_t numSamples = a.getHeight(); + size_t channel = getWidth(); + CHECK_EQ(a.getWidth() % channel, 0UL); + size_t dim = a.getWidth() / channel; + for (size_t i = 0; i < numSamples; i++) { + for (size_t c = 0; c < channel; c++) { + for (size_t j = 0; j < dim; j++) { + B[c] += scale * A[i * channel * dim + c * dim + j]; + } + } + } +} + void CpuMatrix::sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode) { diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index 047c76a860..52cbed528c 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -343,11 +343,35 @@ public: LOG(FATAL) << "Not implemented"; } + virtual void addSharedBias(Matrix& b, real scale) { + LOG(FATAL) << "Not implemented"; + } + + virtual void addBias(Matrix& b, real scale, bool sharedBias) { + if (!sharedBias) { + addBias(b, scale); + } else { + addSharedBias(b, scale); + } + } + /// add each sample from a to this. virtual void collectBias(Matrix& a, real scale) { LOG(FATAL) << "Not implemented"; } + virtual void collectSharedBias(Matrix& a, real scale) { + LOG(FATAL) << "Not implemented"; + } + + virtual void collectBias(Matrix& a, real scale, bool sharedBias) { + if (!sharedBias) { + collectBias(a, scale); + } else { + collectSharedBias(a, scale); + } + } + virtual void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode) { LOG(FATAL) << "Not implemented"; @@ -1021,6 +1045,7 @@ public: /// add b to each sample of this. void addBias(Matrix& b, real scale); + void addSharedBias(Matrix& b, real scale); /** * @code @@ -1028,6 +1053,7 @@ public: * @endcode */ void collectBias(Matrix& a, real scale); + void collectSharedBias(Matrix& a, real scale); void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); @@ -1341,9 +1367,11 @@ public: public: /// add b to each sample of this. void addBias(Matrix& b, real scale); + void addSharedBias(Matrix& b, real scale); /// add each sample of a to this. 
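As a quick illustration of the layout that addSharedBias and collectSharedBias assume, here is a minimal NumPy sketch (not part of the patch; shapes and names are illustrative only): each row of the data matrix consists of `channel` contiguous blocks of `dim` elements, and a single bias value is shared by every element of a channel block.

import numpy as np

# data has shape (num_samples, channel * dim); bias has one entry per channel
num_samples, channel, dim = 4, 3, 5
data = np.random.rand(num_samples, channel * dim)
bias = np.random.rand(channel)
scale = 1.0

# addSharedBias: data[i][c * dim + j] += scale * bias[c]
data_with_bias = data + scale * np.repeat(bias, dim)

# collectSharedBias: bias[c] += scale * sum over i, j of data[i][c * dim + j]
bias_update = scale * data.reshape(num_samples, channel, dim).sum(axis=(0, 2))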
void collectBias(Matrix& a, real scale); + void collectSharedBias(Matrix& a, real scale); void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index ac160479a9..0ddf7e0dfc 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -21,6 +21,8 @@ limitations under the License. */ #include "paddle/math/SparseMatrix.h" #include #include "paddle/gserver/tests/TestUtil.h" +#include "paddle/utils/Stat.h" + using namespace paddle; // NOLINT using namespace std; // NOLINT @@ -2071,6 +2073,60 @@ TEST(Matrix, MaxOutFwdBwd) { } } +void testAddSharedBias(int numSamples, int dim, int channel) { + MatrixPtr cpuData = std::make_shared(numSamples, dim); + MatrixPtr gpuData = std::make_shared(numSamples, dim); + + MatrixPtr cpuBias = std::make_shared(1, channel); + MatrixPtr gpuBias = std::make_shared(1, channel); + + cpuData->randomizeUniform(); + gpuData->copyFrom(*cpuData); + cpuBias->randomizeUniform(); + gpuBias->copyFrom(*cpuBias); + + cpuData->addSharedBias(*cpuBias, 1.0); + gpuData->addSharedBias(*gpuBias, 1.0); + + MatrixPtr check = std::make_shared(numSamples, dim); + check->copyFrom(*gpuData); + MatrixCheckErr(*cpuData, *check); +} + +void testCollectSharedBias(int numSamples, int dim, int channel) { + MatrixPtr cpuData = std::make_shared(numSamples, dim); + MatrixPtr gpuData = std::make_shared(numSamples, dim); + + MatrixPtr cpuBias = std::make_shared(1, channel); + MatrixPtr gpuBias = std::make_shared(1, channel); + + cpuData->randomizeUniform(); + gpuData->copyFrom(*cpuData); + cpuBias->randomizeUniform(); + gpuBias->copyFrom(*cpuBias); + + cpuBias->collectSharedBias(*cpuData, 1.0); + gpuBias->collectSharedBias(*gpuData, 1.0); + + MatrixPtr check = std::make_shared(1, channel); + check->copyFrom(*gpuBias); + MatrixCheckErr(*cpuBias, *check); +} + + +TEST(Matrix, sharedBias) { + for (auto numSamples : {1, 100, 520}) { + for (auto dim : {100 * 16, 100 * 32}) { + for (auto channel : {8, 16}) { + VLOG(3) << " numSamples=" << numSamples << " dim=" << dim + << " channel=" << channel; + testAddSharedBias(numSamples, dim, channel); + testCollectSharedBias(numSamples, dim, channel); + } + } + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt index 08b411d2cc..06c019f0a9 100644 --- a/paddle/trainer/CMakeLists.txt +++ b/paddle/trainer/CMakeLists.txt @@ -7,6 +7,7 @@ set(TRAINER_SOURCES Tester.cpp Trainer.cpp TrainerInternal.cpp + TrainerBenchmark.cpp ThreadParameterUpdater.cpp TrainerInternalConfig.cpp TrainerConfigHelper.cpp) diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h index 4f4811a139..7762722456 100644 --- a/paddle/trainer/Trainer.h +++ b/paddle/trainer/Trainer.h @@ -99,6 +99,7 @@ public: void startTrainPass(); void finishTrainPass(); void trainOneDataBatch(DataBatch& dataBatch); + void time(); /** * given a dataBatch and the current parameter value diff --git a/paddle/trainer/TrainerBenchmark.cpp b/paddle/trainer/TrainerBenchmark.cpp new file mode 100644 index 0000000000..54862e95b4 --- /dev/null +++ b/paddle/trainer/TrainerBenchmark.cpp @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#undef PADDLE_DISABLE_TIMER + +#include "Trainer.h" +#include "paddle/utils/Stat.h" +#include "paddle/utils/Util.h" + +P_DECLARE_int32(test_period); + +P_DEFINE_bool(feed_data, false, "Whether to read data from DataProvider."); + +namespace paddle { + +void Trainer::time() { + startTrain(); + + trainerInternal_.getParameterUpdater()->startPass(); + evaluator_->start(); + + DataBatch dataBatch; + int32_t batchSize = config_->getOptConfig().batch_size(); + int32_t num = dataProvider_->getNextBatch(batchSize, &dataBatch); + CHECK_EQ(num, batchSize) << "The sample number does not match the batch size " + << num << " != " << batchSize; + + CHECK(dataBatch.getSize()) << "No data from data provider"; + + std::vector<Argument> outputs; + // warm-up runs, not timed + LOG(INFO) << "Warm-up..."; + for (int n = 0; n < 10; ++n) { + trainerInternal_.trainOneBatch(n, dataBatch, &outputs); + } + LOG(INFO) << "Warm-up done."; + + for (int n = 0; n < FLAGS_test_period; n++) { + if (FLAGS_feed_data) { + REGISTER_TIMER("GetData"); + num = dataProvider_->getNextBatch(batchSize, &dataBatch); + } + + if (num != batchSize) { + break; + } + + { + REGISTER_TIMER("FwdBwd"); + trainerInternal_.trainOneBatch(n, dataBatch, &outputs); + } + } + globalStat.setThreadInfo(true); + globalStat.printSegTimerStatus(); + globalStat.reset(); + + finishTrain(); +} + +} // namespace paddle diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp index 94266639f9..a486cc383a 100644 --- a/paddle/trainer/TrainerMain.cpp +++ b/paddle/trainer/TrainerMain.cpp @@ -103,6 +103,8 @@ int main(int argc, char** argv) { trainer.checkGradient(); } else if (FLAGS_job == "test") { trainer.test(); + } else if (FLAGS_job == "time") { + trainer.time(); } else { LOG(FATAL) << "Unknown job type: " << FLAGS_job; } diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index 70c1f8d563..79e76b6bf1 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -255,7 +255,7 @@ sinclude(`ModelConfigLayer.proto.m4') // (which is how convnets are usually trained). Setting this to // false will untie the biases, yielding a separate bias for // every location at which the filter is applied. - optional bool shared_biases = 8; + optional bool shared_biases = 8 [default = false]; // Valid values are ones that divide the area of the output // grid in this convolutional layer. For example if this layer @@ -379,6 +379,9 @@ sinclude(`ModelConfigLayer.proto.m4') // use to compute moving mean and variance.
optional real moving_average_fraction = 47 [default = 0.9]; + + // bias size + optional uint32 bias_size = 48 [default = 0]; } message EvaluatorConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index fe8a5e5d48..e909894316 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -632,6 +632,44 @@ class ContextProjection(Projection): _total_pad = 0 +@config_class +class ConvProjection(Projection): + type = 'conv' + + def __init__( + self, + input_layer_name, + num_filters=None, + conv_conf=None, + **xargs): + super(ConvProjection, self).__init__(input_layer_name, **xargs) + + if num_filters is not None: + self.proj_conf.num_filters = num_filters + + parse_conv(conv_conf, + input_layer_name, + self.proj_conf.conv_conf) + # TODO: support rectangle input + self.proj_conf.output_size = (self.proj_conf.conv_conf.output_x ** 2) * num_filters + + def calc_output_size(self, input_layer_config): + return self.proj_conf.output_size + + def calc_parameter_size(self, input_size, output_size): + co = self.proj_conf.num_filters + ci = self.proj_conf.conv_conf.channels + fh = self.proj_conf.conv_conf.filter_size + fw = self.proj_conf.conv_conf.filter_size_y + return co * ci * fh * fw + + def calc_bias_size(self): + return self.proj_conf.num_filters + + def calc_parameter_dims(self, input_size, output_size): + return None + + # Define a operator for mixed layer @config_class class Operator(Cfg): @@ -2528,8 +2566,15 @@ class MixedLayer(LayerBase): record_operator_conf = self.config.operator_confs.add() record_operator_conf.CopyFrom(operator_conf) + psize = self.config.size + if isinstance(self.inputs[0], ConvProjection): + self.config.shared_biases = True + psize = 0 + for input in self.inputs: + psize += input.calc_bias_size() - self.create_bias_parameter(bias, self.config.size) + self.config.bias_size = psize + self.create_bias_parameter(bias, psize) if error_clipping_threshold is not None: self.config.error_clipping_threshold = error_clipping_threshold @@ -2547,8 +2592,10 @@ class ConcatenateLayer(LayerBase): self, name, inputs, + bias=False, **xargs): config_assert(inputs, 'inputs cannot be empty') + config_assert(not bias, 'ConcatenateLayer cannot support bias.') super(ConcatenateLayer, self).__init__( name, 'concat', 0, inputs=inputs, **xargs) size = 0 @@ -2567,10 +2614,19 @@ class ConcatenateLayer2(LayerBase): self, name, inputs, + bias=False, **xargs): config_assert(inputs, 'inputs cannot be empty') super(ConcatenateLayer2, self).__init__( name, 'concat2', 0, inputs=inputs, **xargs) + + if isinstance(self.inputs[0], ConvProjection): + for input_index in xrange(len(self.inputs) - 1): + input = self.inputs[input_index + 1] + config_assert(isinstance(input, ConvProjection), + "The first input of ConcatenateLayer2 is ConvProjection, " + "the other inputs should also be ConvProjection.") + size = 0 for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) @@ -2596,6 +2652,16 @@ class ConcatenateLayer2(LayerBase): input.proj_conf.output_size) self.create_input_parameter(input_index, psize, dims) + psize = self.config.size + if isinstance(self.inputs[0], ConvProjection): + self.config.shared_biases = True + psize = 0 + for input in self.inputs: + psize += input.calc_bias_size() + + self.config.bias_size = psize + self.create_bias_parameter(bias, psize) + @config_layer('recurrent') class RecurrentLayer(LayerBase): def __init__( diff --git 
a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 7df9108ae8..9a23c02431 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -34,7 +34,7 @@ __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel", "table_projection", "mixed_layer", "data_layer", "embedding_layer", "fc_layer", "grumemory", "pooling_layer", "lstmemory", "last_seq", "first_seq", - "cos_sim", "hsigmoid", + "cos_sim", "hsigmoid", "conv_projection", "regression_cost", 'classification_cost', "LayerOutput", 'img_conv_layer', 'img_pool_layer', 'batch_norm_layer', 'img_cmrnorm_layer', 'addto_layer', @@ -1984,7 +1984,7 @@ def addto_layer(input, act=None, name=None, bias_attr=None, @wrap_act_default(act=IdentityActivation()) @wrap_name_default("concat") @layer_support() -def concat_layer(input, act=None, name=None, layer_attr=None): +def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): """ Concat all input vector into one huge vector. Inputs can be list of LayerOutput or list of projection. @@ -2043,10 +2043,14 @@ def concat_layer(input, act=None, name=None, layer_attr=None): layer_type = (LayerType.CONCAT_LAYER if is_concat_layer else LayerType.CONCAT_PROJ_LAYER) + if layer_type == LayerType.CONCAT_LAYER: + assert not bias_attr + Layer( name=name, type=layer_type, inputs=[x.name for x in input] if is_concat_layer else input, active_type=act.name, + bias=ParamAttr.to_bias(bias_attr), **ExtraLayerAttribute.to_kwargs(layer_attr) ) @@ -2950,6 +2954,103 @@ def conv_operator(img, filter, filter_size, num_filters, op.origin = [img, filter] return op +@wrap_param_attr_default() +def conv_projection(input, filter_size, num_filters, + num_channels=None, stride=1, padding=0, + filter_size_y=None, stride_y=None, padding_y=None, + groups=1, param_attr=None): + """ + ConvProjection with a layer as input. + It performs a convolution on the input with the filter weights. + + Different from img_conv_layer and conv_op, conv_projection is a Projection, + which can be used in mixed_layer and concat_layer. It uses cuDNN to implement + the convolution and only supports GPU mode. + + The example usage is: + + .. code-block:: python + + proj = conv_projection(input=input1, + filter_size=3, + num_filters=64, + num_channels=64) + + :param input: input layer + :type input: LayerOutput + :param filter_size: The x dimension of a filter kernel. + :type filter_size: int + :param filter_size_y: The y dimension of a filter kernel. Since + PaddlePaddle now supports rectangular filters, + the filter's shape can be (filter_size, filter_size_y). + :type filter_size_y: int + :param num_filters: the number of filters (output channels). + :type num_filters: int + :param num_channels: the number of channels of the input data. + :type num_channels: int + :param stride: The x dimension of the stride. + :type stride: int + :param stride_y: The y dimension of the stride. + :type stride_y: int + :param padding: The x dimension of padding. + :type padding: int + :param padding_y: The y dimension of padding. + :type padding_y: int + :param groups: The group number. + :type groups: int + :param param_attr: Convolution param attribute. None means the default attribute. + :type param_attr: ParameterAttribute + :return: A ConvProjection Object.
+ :rtype: ConvProjection + """ + if num_channels is None: + assert input.num_filters is not None + num_channels = input.num_filters + + if filter_size_y is None: + if isinstance(filter_size, collections.Sequence): + assert len(filter_size) == 2 + filter_size, filter_size_y = filter_size + else: + filter_size_y = filter_size + + if stride_y is None: + if isinstance(stride, collections.Sequence): + assert len(stride) == 2 + stride, stride_y = stride + else: + stride_y = stride + + if padding_y is None: + if isinstance(padding, collections.Sequence): + assert len(padding) == 2 + padding, padding_y = padding + else: + padding_y = padding + + if param_attr.attr.get('initial_smart'): + # special initial for conv layers. + init_w = (2.0 / (filter_size ** 2 * num_channels)) ** 0.5 + param_attr.attr["initial_mean"] = 0.0 + param_attr.attr["initial_std"] = init_w + param_attr.attr["initial_strategy"] = 0 + param_attr.attr["initial_smart"] = False + + proj = ConvProjection(input_layer_name=input.name, + num_filters=num_filters, + conv_conf=Conv(filter_size=filter_size, + padding=padding, + stride=stride, + channels=num_channels, + filter_size_y=filter_size_y, + padding_y=padding_y, + stride_y=stride_y, + groups=groups), + **param_attr.attr) + + proj.origin = input + return proj + @wrap_name_default() @layer_support() diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 65512b327c..bce88f9362 100644 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -29,7 +29,7 @@ __all__ = ['sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool", "img_conv_bn_pool", 'dropout_layer', 'lstmemory_group', 'lstmemory_unit', 'small_vgg', 'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru', 'simple_attention', - 'text_conv_pool', + 'simple_gru2', 'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm', 'inputs', 'outputs'] @@ -811,22 +811,37 @@ def simple_gru(input, gru_layer_attr=None ): """ - simple_gru is also a recurrent layer group version Gated Recurrent Unit as - gru_group. The difference only lies in implemention details. + You may see gru_step_layer and grumemory in layers.py, and gru_unit, gru_group, + simple_gru in networks.py. The reason why there are so many interfaces is + that we have two ways to implement recurrent neural networks. One way is to + use one complete layer to implement rnn (including simple rnn, gru and lstm) + with multiple time steps, such as recurrent_layer, lstmemory, grumemory. However, + the multiplication operation :math:`W x_t` is not computed in these layers. + See details in their interfaces in layers.py. + The other implementation is to use a recurrent group which can assemble a + series of layers to compute the rnn step by step. This way is flexible for + attention mechanisms or other complex connections. + + - gru_step_layer: only computes one rnn step. It needs a memory as input + and can be used in a recurrent group. + - gru_unit: a wrapper of gru_step_layer with memory. + - gru_group: a GRU cell implemented by a combination of multiple layers in + a recurrent group. + But :math:`W x_t` is not computed in the group. + - grumemory: a GRU cell implemented by one layer, which does the same calculation + as gru_group and is faster than gru_group. + - simple_gru: a complete GRU implementation including :math:`W x_t` and + gru_group. :math:`W` contains :math:`W_r`, :math:`W_z` and :math:`W`, see + formula in grumemory.
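The conv_projection helper above is the building block this patch adds for memory-efficient multi-branch convolutions (GoogLeNet-style blocks). Below is a hypothetical sketch of such a block, in the same style as the img_conv test configs earlier in this patch; the layer names and sizes are made up and not part of the patch.

from paddle.trainer_config_helpers import *

settings(batch_size=32)
# 64-channel, 28x28 input image, flattened to a vector
img = data_layer(name='image', size=64 * 28 * 28)

# two branches with different receptive fields; padding keeps both at 28x28
branch1 = conv_projection(input=img, filter_size=1, num_channels=64,
                          num_filters=32, stride=1, padding=0)
branch2 = conv_projection(input=img, filter_size=3, num_channels=64,
                          num_filters=32, stride=1, padding=1)

# concatenate the projection outputs; the concat layer applies the activation
block = concat_layer(input=[branch1, branch2], bias_attr=False,
                     act=ReluActivation())
outputs(block)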
+ In terms of computational speed, grumemory is faster than gru_group, + and gru_group is faster than simple_gru. - simple_gru does exactly the same calculation as the grumemory layer does. - Please see grumemory in layers.py for more detail about the maths. - The example usage is: .. code-block:: python - gru = gur_group(input=[layer1], - size=256, - act=TanhActivation(), - gate_act=SigmoidActivation()) + gru = simple_gru(input=[layer1], size=256) :param input: input layer name. :type input: LayerOutput @@ -863,6 +878,132 @@ def simple_gru(input, gru_layer_attr=gru_layer_attr) +@wrap_name_default('simple_gru2') +def simple_gru2(input, + size, + name=None, + reverse=False, + mixed_param_attr=None, + mixed_bias_attr=None, + gru_param_attr=None, + gru_bias_attr=None, + act=None, + gate_act=None, + mixed_layer_attr=None, + gru_cell_attr=None + ): + """ + simple_gru2 is the same as simple_gru, but it uses grumemory instead. + Please see grumemory in layers.py for more detail about the maths. + simple_gru2 is faster than simple_gru. + + The example usage is: + + .. code-block:: python + + gru = simple_gru2(input=[layer1], size=256) + + :param input: input layer name. + :type input: LayerOutput + :param name: name of the gru group. + :type name: basestring + :param size: hidden size of the gru. + :type size: int + :param reverse: whether to process the input data in a reverse order + :type reverse: bool + :param act: type of the activation + :type act: BaseActivation + :param gate_act: type of the gate activation + :type gate_act: BaseActivation + :param gru_bias_attr: bias. False means no bias, None means default bias. + :type gru_bias_attr: ParameterAttribute|False + :param gru_layer_attr: Extra parameter attribute of the gru layer. + :type gru_layer_attr: ParameterAttribute|False + :return: the gru group. + :rtype: LayerOutput + """ + with mixed_layer(name='%s_transform' % name, + size=size * 3, + bias_attr=mixed_bias_attr, + layer_attr=mixed_layer_attr) as m: + m += full_matrix_projection(input=input, param_attr=mixed_param_attr) + + return grumemory(name=name, + size=size, + input=m, + reverse=reverse, + bias_attr=gru_bias_attr, + param_attr=gru_param_attr, + act=act, + gate_act=gate_act, + layer_attr=gru_cell_attr) + + +@wrap_name_default("bidirectional_gru") +def bidirectional_gru(input, size, name=None, return_seq=False, + fwd_mixed_param_attr=None, fwd_mixed_bias_attr=None, + fwd_gru_param_attr=None, fwd_gru_bias_attr=None, + fwd_act=None, fwd_gate_act=None, + fwd_mixed_layer_attr=None, fwd_gru_cell_attr=None, + + bwd_mixed_param_attr=None, bwd_mixed_bias_attr=None, + bwd_gru_param_attr=None, bwd_gru_bias_attr=None, + bwd_act=None, bwd_gate_act=None, + bwd_mixed_layer_attr=None, bwd_gru_cell_attr=None, + + last_seq_attr=None, first_seq_attr=None, + concat_attr=None, concat_act=None): + """ + A bidirectional_gru is a recurrent unit that iterates over the input + sequence both in forward and backward orders, and then concatenates the two + outputs to form the final output. However, concatenation of the two outputs + is not the only way to form the final output; you can also, for example, + just add them together. + + The example usage is: + + .. code-block:: python + + bi_gru = bidirectional_gru(input=[input1], size=512) + + :param name: bidirectional gru layer name. + :type name: basestring + :param input: input layer. + :type input: LayerOutput + :param size: gru layer size.
+ :type size: int + :param return_seq: If set False, outputs of the last time step are + concatenated and returned. + If set True, the entire output sequences that are + processed in forward and backward directions are + concatenated and returned. + :type return_seq: bool + :return: LayerOutput object. + :rtype: LayerOutput + """ + args = locals() + + fw = simple_gru2(name='%s_fw' % name, input=input, size=size, + **dict((k[len('fwd_'):], v) for k, v in args.iteritems() + if k.startswith('fwd_'))) + + bw = simple_gru2(name="%s_bw" % name, input=input, size=size, + reverse=True, + **dict((k[len('bwd_'):], v) for k, v in args.iteritems() + if k.startswith('bwd_'))) + + if return_seq: + return concat_layer(name=name, input=[fw, bw], layer_attr=concat_attr, + act=concat_act) + else: + fw_seq = last_seq(name="%s_fw_last" % name, input=fw, + layer_attr=last_seq_attr) + bw_seq = first_seq(name="%s_bw_last" % name, input=bw, + layer_attr=first_seq_attr) + return concat_layer(name=name, input=[fw_seq, bw_seq], + layer_attr=concat_attr, act=concat_act) + + @wrap_name_default("bidirectional_lstm") def bidirectional_lstm(input, size, name=None, return_seq=False, fwd_mat_param_attr=None, fwd_bias_param_attr=None, @@ -893,7 +1034,7 @@ def bidirectional_lstm(input, size, name=None, return_seq=False, .. code-block:: python - lstm_step = bidirectional_lstm(input=[input1], size=512) + bi_lstm = bidirectional_lstm(input=[input1], size=512) :param name: bidirectional lstm layer name. :type name: basestring @@ -907,7 +1048,7 @@ def bidirectional_lstm(input, size, name=None, return_seq=False, processed in forward and backward directions are concatenated and returned. :type return_seq: bool - :return: lstm layer name. + :return: LayerOutput object according to return_seq.
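The bidirectional_gru docstring above notes that concatenation is not the only way to combine the two directions. Here is a hypothetical sketch of the element-wise-sum alternative, built from the same simple_gru2 helper; the names and sizes are illustrative and not part of the patch.

from paddle.trainer_config_helpers import *

settings(batch_size=128)
din = data_layer(name='data', size=120)

fw = simple_gru2(name='gru_fw', input=din, size=40)
bw = simple_gru2(name='gru_bw', input=din, size=40, reverse=True)

# add the two equally-sized output sequences instead of concatenating them
combined = addto_layer(input=[fw, bw], act=TanhActivation())
outputs(combined)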
:rtype: LayerOutput """ args = locals() diff --git a/python/paddle/trainer_config_helpers/tests/configs/check.md5 b/python/paddle/trainer_config_helpers/tests/configs/check.md5 index d1b22b3490..72dfdad7bd 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/check.md5 +++ b/python/paddle/trainer_config_helpers/tests/configs/check.md5 @@ -1,10 +1,11 @@ 86c0815275a9d5eb902e23c6a592f58a img_layers.protostr a5d9259ff1fd7ca23d0ef090052cb1f2 last_first_seq.protostr 9c038249ec8ff719753a746cdb04c026 layer_activations.protostr -5913f87b39cee3b2701fa158270aca26 projections.protostr +34e04043cbb12931c47fa44ec50eeffc projections.protostr 7334ba0a4544f0623231330fc51d390d shared_fc.protostr -8b8b6bb128a7dfcc937be86145f53e2f shared_lstm.protostr +bb8e233b05b8e07f9ed386b7aee4f2c6 shared_lstm.protostr 6b39e34beea8dfb782bee9bd3dea9eb5 simple_rnn_layers.protostr +f98e79e1630d5eb827c300e64836d269 test_bi_grumemory.protostr 0fc1409600f1a3301da994ab9d28b0bf test_cost_layers.protostr 6cd5f28a3416344f20120698470e0a4c test_cost_layers_with_weight.protostr 144bc6d3a509de74115fa623741797ed test_expand_layer.protostr @@ -15,7 +16,7 @@ d350bd91a0dc13e854b1364c3d9339c6 test_lstmemory_layer.protostr 5433ed33d4e7414eaf658f2a55946186 test_maxout.protostr 251a948ba41c1071afcd3d9cf9c233f7 test_ntm_layers.protostr e6ff04e70aea27c7b06d808cc49c9497 test_print_layer.protostr -2a75dd33b640c49a8821c2da6e574577 test_rnn_group.protostr +fded24727338fb8ce44d9951ed8aea08 test_rnn_group.protostr 67d6fde3afb54f389d0ce4ff14726fe1 test_sequence_pooling.protostr f586a548ef4350ba1ed47a81859a64cb unused_layers.protostr -8122477f4f65244580cec09edc590041 util_layers.protostr +f937a5a6e7e8864b4d8cf56b0f7c7f44 util_layers.protostr diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh index 4b1d2d3d41..6a31ceabdf 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh @@ -9,7 +9,7 @@ test_sequence_pooling test_lstmemory_layer test_grumemory_layer last_first_seq test_expand_layer test_ntm_layers test_hsigmoid img_layers util_layers simple_rnn_layers unused_layers test_cost_layers test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight -test_maxout) +test_maxout test_bi_grumemory) for conf in ${configs[*]} diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py b/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py new file mode 100644 index 0000000000..ab9f7c4948 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py @@ -0,0 +1,10 @@ +from paddle.trainer_config_helpers import * + +settings( + batch_size=1000, + learning_rate=1e-4 +) + +din = data_layer(name='data', size=120) + +outputs(bidirectional_gru(input=din, size=40, return_seq=True)) -- GitLab
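For reference, a back-of-the-envelope check (not code from the patch) of what the shared-bias path saves for the conv projections in the img_conv test config above: each 1x1 projection produces 16 filters over a 16x16 output grid, so if a bias were enabled on the concat of the two branches, shared biases need one value per filter, while untied biases would need one per output location.

# bias counts for two conv projections, 16 filters each, 16x16 output grid
num_projections, num_filters, out_h, out_w = 2, 16, 16, 16

shared_bias_size = num_projections * num_filters                   # 32
untied_bias_size = num_projections * num_filters * out_h * out_w   # 8192

print(shared_bias_size, untied_bias_size)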