diff --git a/.travis.yml b/.travis.yml index a78853e15b15825354ffbc6e1ca8ffb10c5257c6..d3dae9efd416bd92dde9b327424544da401f2025 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,9 @@ language: cpp cache: ccache sudo: required dist: trusty +env: + - JOB=DOCS + - JOB=BUILD_AND_TEST addons: apt: packages: @@ -16,6 +19,7 @@ addons: - python2.7-dev - m4 - libprotobuf-dev + - doxygen - protobuf-compiler - python-protobuf - python-numpy @@ -24,12 +28,10 @@ addons: - libgflags-dev - libgtest-dev before_install: - - pip install wheel protobuf + - pip install wheel protobuf sphinx breathe recommonmark - sudo paddle/scripts/travis/before_install.sh script: - - paddle/scripts/travis/build.sh - - paddle/scripts/travis/unittest.sh - - paddle/scripts/travis/make_install.sh + - paddle/scripts/travis/main.sh notifications: email: on_success: change diff --git a/doc/build/contribute_to_paddle.md b/doc/build/contribute_to_paddle.md index b3d5fa7c9ff5f0b879e15b8017d029bc23e9ada8..10d5d86311333c223d1024f520fccddcb4c5050d 100644 --- a/doc/build/contribute_to_paddle.md +++ b/doc/build/contribute_to_paddle.md @@ -25,7 +25,7 @@ repo or just head straight to the command line: ```shell # Clone your fork to your local machine -git clone git@github.com:USERNAME/paddle.git +git clone git@github.com:USERNAME/Paddle.git ``` Then you can start to develop. @@ -52,7 +52,7 @@ To do this, you'll need to add a remote at first: # see the current configured remote repository git remote -v # add upstream repository -git remote add upstream https://github.com/paddle/paddle.git +git remote add upstream https://github.com/baidu/Paddle.git # verify the new upstream git remote -v ``` diff --git a/doc/build/index.rst b/doc/build/index.rst index 2b983dceb2777e6c79ee1efaa977fef6e5c33ad6..d6d0d19e110fc35faec87da90d784a6775b9c91f 100644 --- a/doc/build/index.rst +++ b/doc/build/index.rst @@ -9,6 +9,7 @@ Install PaddlePaddle :glob: install_* + internal/install_from_jumbo.md Build from Source ----------------- diff --git a/doc/cluster/index.rst b/doc/cluster/index.rst index cf1ea97715402ec5b5b565a295ff4c1515df2570..9062f85f98d2981b5c8dcf8149e32c2ccdac77f4 100644 --- a/doc/cluster/index.rst +++ b/doc/cluster/index.rst @@ -5,3 +5,4 @@ Cluster Train :glob: opensource/cluster_train.md + internal/index.md diff --git a/doc/ui/api/trainer_config_helpers/layers.rst b/doc/ui/api/trainer_config_helpers/layers.rst index 1583fce981fed64141acdccc0d89b46b63d13cc0..f902d1c995bc5045d62d0b2e279ee612f9dc7c93 100644 --- a/doc/ui/api/trainer_config_helpers/layers.rst +++ b/doc/ui/api/trainer_config_helpers/layers.rst @@ -245,10 +245,10 @@ addto_layer :members: addto_layer :noindex: -convex_comb_layer +linear_comb_layer ----------------- .. automodule:: paddle.trainer_config_helpers.layers - :members: convex_comb_layer + :members: linear_comb_layer :noindex: interpolation_layer @@ -280,7 +280,13 @@ tensor_layer .. automodule:: paddle.trainer_config_helpers.layers :members: tensor_layer :noindex: - + +cos_sim +------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: cos_sim + :noindex: + trans_layer ------------ .. automodule:: paddle.trainer_config_helpers.layers @@ -341,12 +347,6 @@ rank_cost :members: rank_cost :noindex: -cos_sim -------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: cos_sim - :noindex: - crf_layer ----------------- .. 
automodule:: paddle.trainer_config_helpers.layers diff --git a/doc_cn/build_and_install/index.rst b/doc_cn/build_and_install/index.rst index e9182903c5f62b3a96c196d5ba1ebba2fd14f669..e21fc98c63dcdcda8202dad349ffe24dda62492d 100644 --- a/doc_cn/build_and_install/index.rst +++ b/doc_cn/build_and_install/index.rst @@ -9,7 +9,11 @@ Note: The intallation packages are still in pre-release state and your experienc .. toctree:: :maxdepth: 1 + :glob: + 源码下载(对内) <../build/internal/download_paddle_source_zh_cn.rst> + 使用Jumbo安装(对内) <../build/internal/install_from_jumbo.rst> + 从源码编译安装(对内) <../build/internal/build_from_source_zh_cn.rst> install/docker_install.rst install/ubuntu_install.rst cmake/index.rst diff --git a/doc_cn/cluster/index.rst b/doc_cn/cluster/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..25313a9635bbf567a1aedfac3c379802d601d283 --- /dev/null +++ b/doc_cn/cluster/index.rst @@ -0,0 +1,11 @@ +集群训练 +======== + +* `集群训练 <../../doc/cluster/index.html>`_ + +.. toctree:: + :maxdepth: 2 + :glob: + + 集群训练(对内) + diff --git a/doc_cn/index.rst b/doc_cn/index.rst index 5f06463899f6b7b8166ff2cccd87b17817c6f5d1..6cf5588b5b34f5e80ea4c70cc364d4c6c42cce3d 100644 --- a/doc_cn/index.rst +++ b/doc_cn/index.rst @@ -8,7 +8,7 @@ PaddlePaddle文档 * `用户接口 `_ * `使用示例 `_ * `模型配置 <../doc/ui/api/trainer_config_helpers/index.html>`_ -* `集群训练 <../doc/cluster/index.html>`_ +* `集群训练 `_ 开发指南 -------- diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc index 19c94b2453981301bcb632ecbe5d322369009973..c2dce1977bdf5daefb6c5b8032bb6b12563e9425 100644 --- a/paddle/cuda/src/hl_cuda_cudnn.cc +++ b/paddle/cuda/src/hl_cuda_cudnn.cc @@ -150,7 +150,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP) // APIs available after R4: -#if CUDNN_VERSION >= 4000 +#if CUDNN_VERSION >= 4007 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ __macro(cudnnBatchNormalizationForwardTraining) \ __macro(cudnnBatchNormalizationForwardInference) \ @@ -999,7 +999,7 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, double epsilon, real *savedMean, real *savedVar) { -#if CUDNN_VERSION >= 4000 +#if CUDNN_VERSION >= 4007 if ((NULL != runningMean && NULL == runningInvVar) || (NULL == runningMean && NULL != runningInvVar)) { LOG(FATAL) << "runningMean and runningInvVar can be NULL " @@ -1024,7 +1024,7 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, CHECK_SYNC("hl_batch_norm_forward_training failed"); #else - LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4000. " + LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. " << "But cudnn lib version is " << g_cudnn_lib_version; #endif } @@ -1039,7 +1039,7 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, real *estimatedMean, real *estimatedInvVar, double epsilon) { -#if CUDNN_VERSION >= 4000 +#if CUDNN_VERSION >= 4007 cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc); cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc); cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc); @@ -1053,7 +1053,7 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, CHECK_SYNC("hl_batch_norm_forward_inference failed"); #else - LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4000. " + LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. 
" << "But cudnn lib version is " << g_cudnn_lib_version; #endif } @@ -1071,7 +1071,7 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc, double epsilon, real *savedMean, real *savedInvVar) { -#if CUDNN_VERSION >= 4000 +#if CUDNN_VERSION >= 4007 if ((NULL != savedMean && NULL == savedInvVar) || (NULL == savedMean && NULL != savedInvVar)) { LOG(FATAL) << "savedMean and savedVar can be NULL " @@ -1087,16 +1087,14 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc, cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward( t_resource.cudnn_handle, mode, &alpha, &beta, -#if CUDNN_VERSION >= 5000 &alpha, &beta, -#endif xDesc, input, dyDesc, outGrad, dxDesc, inGrad, bnDesc, scale, scaleGrad, biasGrad, epsilon, savedMean, savedInvVar)); CHECK_SYNC("hl_batch_norm_backward failed"); #else - LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4000. " + LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. " << "But cudnn lib version is " << g_cudnn_lib_version; #endif } diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu index fc003b7d6377d199c3859aa8b257de07478992af..ecc44944e4fa19b064fb0aa09d81e2143e5bc85d 100644 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ b/paddle/cuda/src/hl_cuda_matrix.cu @@ -19,6 +19,7 @@ limitations under the License. */ #include "hl_matrix_apply.cuh" #include "hl_sequence.h" #include "paddle/utils/Logging.h" +#include "hl_device_functions.cuh" DEFINE_MATRIX_UNARY_OP(Zero, a = 0); DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b); diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp index d0b1c0447d23d3e7072b2ee4f8e860708eb44bb2..e397c71c877dce8c34aefac12481373a037510f6 100644 --- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp +++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp @@ -194,8 +194,8 @@ public: virtual real evalImp(std::vector& arguments) { CHECK_EQ(arguments.size(), (size_t)2); Argument output, label; - output.resizeAndCopyFrom(arguments[0], false); - label.resizeAndCopyFrom(arguments[1], false); + output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT); + label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT); CHECK(label.sequenceStartPositions); CHECK(label.ids); @@ -207,7 +207,7 @@ public: real err = 0; err = editDistance( output.value->getData() + output.value->getWidth() * outputStarts[i], - output.value->getHeight(), output.value->getWidth(), + outputStarts[i+1] - outputStarts[i], output.value->getWidth(), label.ids->getData() + labelStarts[i], labelStarts[i + 1] - labelStarts[i]); @@ -224,6 +224,9 @@ public: for (const std::string& name : config_.input_layers()) { arguments.push_back(nn.getLayer(name)->getOutput()); } + } + + virtual void updateSamplesNum(const std::vector& arguments) { numSequences_ += arguments[1].getNumSequences(); } diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 7bc5fe51813c94a4347118f1366370ec8b867e02..bf7aa1c8d89aeff396a4ed094fc36043defeb1a5 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/utils/Stat.h" #include "paddle/utils/Util.h" #include "paddle/utils/Flags.h" @@ -291,6 +290,8 @@ void RecurrentGradientMachine::init( if (subModelConfig->evaluator_names_size() > 0) { evaluator_.reset(frames_[0]->makeEvaluator()); } + + targetInfoInlinkId_ = subModelConfig->target_inlinkid(); } void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) { @@ -325,7 +326,7 @@ void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) { for (int i = frames_.size(); i < numFrames; ++i) { std::unique_ptr frame( - NeuralNetwork::newNeuralNetwork(subModelName_)); + NeuralNetwork::newNeuralNetwork(subModelName_)); frame->init(config_, subParamInitCb); for (auto& inFrameLine : inFrameLines_) { @@ -382,6 +383,16 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, size_t numSequences = input.getNumSequences(); const int* starts = input.sequenceStartPositions->getData(false); bool hasSubseq = input.hasSubseq(); + + // In case of !hasSubseq or targetInfoInlinkId_ == -1, all inlinks share the + // same inframe info + bool shareInlinkInfo = !hasSubseq || targetInfoInlinkId_ == -1; + + // Defaultly, share info with the first inlink + if (shareInlinkInfo) { + targetInfoInlinkId_ = 0; + } + // check hasSubseq in both config and input are the same CHECK_EQ(hasSubseq, inFrameLines_[0].hasSubseq); @@ -394,9 +405,17 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, CHECK_EQ((size_t)input1.getNumSequences(), numSequences); // check all inputs should have same hasSubseq flag CHECK_EQ(input.hasSubseq(), inFrameLines_[0].hasSubseq); - CHECK_EQ(input1.getBatchSize(), batchSize); - CHECK(std::equal(starts, starts + numSequences + 1, - input1.sequenceStartPositions->getData(false))); + + // if shareInlinkInfo, checks: + // 1. all inlinks have same number of total tokens + // 2. all inlinks have same number of tokens for each sentence of each + // sample. 
If hasSubseq, one sample has multiple sentence, else, one + // sample is one sentence + if (shareInlinkInfo) { + CHECK_EQ(input1.getBatchSize(), batchSize); + CHECK(std::equal(starts, starts + numSequences + 1, + input1.sequenceStartPositions->getData(false))); + } } if (hasSubseq) { @@ -408,19 +427,46 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, for (size_t i = 1; i < inFrameLines_.size(); ++i) { const Argument& input1 = inFrameLines_[i].inLayer->getOutput(); CHECK_EQ((size_t)input1.getNumSubSequences(), numSubSequences); - CHECK(std::equal(subStarts, subStarts + numSubSequences + 1, - input1.subSequenceStartPositions->getData(false))); + if (shareInlinkInfo) { + CHECK(std::equal(subStarts, subStarts + numSubSequences + 1, + input1.subSequenceStartPositions->getData(false))); + } } } - seqLengthAndStart_.clear(); - input.getSeqLengthAndStart(&seqLengthAndStart_, &maxSequenceLength_); + info_.clear(); + info_.resize(inFrameLines_.size()); + + seqInfos_.clear(); + seqInfos_.resize(inFrameLines_.size()); + + { + AsyncGpuBlock asyncGpuBlock; + // if shareInlinkInfo, only calculate info of the first inlink + // else, calculate info for each inlink + if (shareInlinkInfo) { + input.getSeqInfo(&seqInfos_[0]); + maxSequenceLength_ = seqInfos_[0][0].topLevelLength; + createInFrameInfo(0, input, passType); + } else { + for (size_t i = 0; i < inFrameLines_.size(); i++) { + const Argument& input1 = inFrameLines_[i].inLayer->getOutput(); + input1.getSeqInfo(&seqInfos_[i]); + maxSequenceLength_ = seqInfos_[i][0].topLevelLength; + createInFrameInfo(i, input1, passType); + } + } + + // inFrameLine select rows in real layer one time + for (size_t i = 0; i < inFrameLines_.size(); i++) { + int curInlinkId = shareInlinkInfo ? 0 : i; + selectRowsOneTime(inFrameLines_[i].inLayer, info_[curInlinkId].allIds, + &(inFrameLines_[i].outArg), passType); + } + } resizeOrCreateFrames(maxSequenceLength_); resizeBootFrame(numSequences); - AsyncGpuBlock asyncGpuBlock; - createInFrameInfo(input, passType); - for (auto& memoryFrameLine : memoryFrameLines_) { if (memoryFrameLine.rootAgent) { auto scatterAgent = @@ -443,23 +489,29 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, auto gatherAgent = dynamic_cast(outFrameLine.agentLayer.get()); CHECK_NOTNULL(gatherAgent); - gatherAgent->copyIdAndSequenceInfo(input, info_.allIds, info_.idIndex); + gatherAgent->copyIdAndSequenceInfo(input, info_[targetInfoInlinkId_].allIds, + info_[targetInfoInlinkId_].idIndex); } for (int i = 0; i < maxSequenceLength_; ++i) { - int idSize = info_.idIndex[i + 1] - info_.idIndex[i]; - + int idSize = 0; // connect in_links - for (auto& inFrameLine : inFrameLines_) { + for (size_t j = 0; j < inFrameLines_.size(); ++j) { + // idSize denotes the sum number of tokens in each length i + idSize = info_[j].idIndex[i + 1] - info_[j].idIndex[i]; + InFrameLine inFrameLine = inFrameLines_[j]; auto scatterAgent = dynamic_cast(inFrameLine.agents[i].get()); scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer, - inFrameLine.outArg, info_.allIds, - info_.idIndex[i], idSize); + inFrameLine.outArg, info_[j].allIds, + info_[j].idIndex[i], idSize); if (hasSubseq) { - int size = info_.seqStartPosIndex[i + 1] - info_.seqStartPosIndex[i]; - scatterAgent->setSequenceStartPositions( - info_.sequenceStartPositions, info_.seqStartPosIndex[i], size); + // size: the length of subsequence + int size = + info_[j].seqStartPosIndex[i + 1] - info_[j].seqStartPosIndex[i]; + 
scatterAgent->setSequenceStartPositions(info_[j].sequenceStartPositions, + info_[j].seqStartPosIndex[i], + size); } } @@ -469,13 +521,16 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, dynamic_cast(outFrameLine.agentLayer.get()); gatherAgent->addRealLayer(outFrameLine.frames[i]); } - // connect memory links + // Adopt info_[0].idIndex because seq which has_subseq=True + // doesn't support Memory with !hasSubseq bootlayer; + // And inlinks that !hasSubSeq must have same inlink length. + idSize = info_[0].idIndex[i + 1] - info_[0].idIndex[i]; for (auto& memoryFrameLine : memoryFrameLines_) { NeuralNetwork::connect( memoryFrameLine.agents[i], i == 0 ? memoryFrameLine.bootLayer : memoryFrameLine.frames[i - 1], - idSize /*height of agent*/); + numSeqs_[i] /*height of agent*/); } } @@ -560,62 +615,77 @@ void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() { * If hasSubseq, will also create scattered sequenceStartPositions infomation * for all realLayer of inFrameLines one time. */ -void RecurrentGradientMachine::createInFrameInfo(const Argument& input, + +void RecurrentGradientMachine::createInFrameInfo(int inlinkId, + const Argument& input, PassType passType) { bool hasSubseq = input.hasSubseq(); + // numSequences: # samples(sequences) in a batch size_t numSequences = input.getNumSequences(); std::vector allIds; - info_.idIndex.clear(); - info_.idIndex.push_back(0); // first idIndex = 0 - if (hasSubseq) { // for sequenceScatterAgentLayer - size_t numSubSequences = input.getNumSubSequences(); - std::vector sequenceStartPositions; - info_.seqStartPosIndex.clear(); - info_.seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0 - for (int i = 0; i < maxSequenceLength_; ++i) { - sequenceStartPositions.push_back(0); // first element = 0 - for (size_t j = 0; j < numSubSequences; ++j) { - if (std::get<3>(seqLengthAndStart_[j]) == i) { - int subSeqStart = std::get<1>(seqLengthAndStart_[j]); - int subSeqLength = std::get<0>(seqLengthAndStart_[j]); - for (int k = subSeqStart; k < subSeqStart + subSeqLength; ++k) { - allIds.push_back(k); - } - sequenceStartPositions.push_back(sequenceStartPositions.back() + - subSeqLength); - } - } - info_.idIndex.push_back(allIds.size()); - info_.seqStartPosIndex.push_back(sequenceStartPositions.size()); + + auto& seqInfo = seqInfos_[inlinkId]; + + numSeqs_.clear(); + Info* inlinkInfo = &info_[inlinkId]; + inlinkInfo->idIndex.clear(); + inlinkInfo->idIndex.push_back(0); // first idIndex = 0 + + std::vector sequenceStartPositions; + const int* subSequenceStartPositions = nullptr; + + if (hasSubseq) { // for sequenceScatterAgentLayer + subSequenceStartPositions = + input.subSequenceStartPositions->getData(false); + inlinkInfo->seqStartPosIndex.clear(); + inlinkInfo->seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0 + } + // maxSequenceLength_: max topLevelLength in allsamples + for (int i = 0; i < maxSequenceLength_; ++i) { + if (hasSubseq) { + sequenceStartPositions.push_back(0); // first element = 0 } - // inFrameLine create sequenceStartPositions one time - CHECK_EQ(sequenceStartPositions.size(), - maxSequenceLength_ + numSubSequences); - CHECK_EQ(info_.seqStartPosIndex.size(), - static_cast(maxSequenceLength_ + 1)); - createSeqPos(sequenceStartPositions, &info_.sequenceStartPositions); - } else { // for scatterAgentLayer - for (int i = 0; i < maxSequenceLength_; ++i) { - for (size_t j = 0; j < numSequences; ++j) { - int seqLength = std::get<0>(seqLengthAndStart_[j]); - if (i >= seqLength) { - break; + int numSeqs = 
0; + for (size_t j = 0; j < numSequences; ++j) { + int seqLength = seqInfo[j].topLevelLength; + if (i >= seqLength) { + break; + } + ++numSeqs; + if (hasSubseq) { + int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i]; + int subSeqEnd = + subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1]; + for (int k = subSeqStart; k < subSeqEnd; ++k) { + allIds.push_back(k); } - int seqStart = std::get<1>(seqLengthAndStart_[j]); + sequenceStartPositions.push_back(sequenceStartPositions.back() + + subSeqEnd - subSeqStart); + } else { + int seqStart = seqInfo[j].seqStart; allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i) : (seqStart + i)); } - info_.idIndex.push_back(allIds.size()); + } + inlinkInfo->idIndex.push_back(allIds.size()); + numSeqs_.push_back(numSeqs); + if (hasSubseq) { + inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size()); } } - // copy and check scatterId - copyScattedId(allIds, &info_.allIds, input.getBatchSize()); - CHECK_EQ(info_.idIndex.size(), static_cast(maxSequenceLength_ + 1)); - // inFrameLine select rows in real layer one time - for (auto& inFrameLine : inFrameLines_) { - selectRowsOneTime(inFrameLine.inLayer, info_.allIds, &inFrameLine.outArg, - passType); + if (hasSubseq) { + // inFrameLine create sequenceStartPositions one time + CHECK_EQ(sequenceStartPositions.size(), + maxSequenceLength_ + input.getNumSubSequences()); + CHECK_EQ(inlinkInfo->seqStartPosIndex.size(), + static_cast(maxSequenceLength_ + 1)); + createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions); } + + // copy and check scatterId + copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize()); + CHECK_EQ(inlinkInfo->idIndex.size(), + static_cast(maxSequenceLength_ + 1)); } /* like createInFrameInfo, but for all realLayer of memoryFrameLines*/ @@ -633,19 +703,20 @@ void RecurrentGradientMachine::createMemoryFrameInfo( sequenceStartPositions.push_back(0); // first element = 0 const int* starts = input.sequenceStartPositions->getData(false); for (size_t i = 0; i < numSequences; ++i) { - int seqId = std::get<2>(seqLengthAndStart_[i]); + // memory info adopt info of inlinks[0] + int seqId = seqInfos_[0][i].seqId; for (int k = starts[seqId]; k < starts[seqId + 1]; ++k) { allIds.push_back(k); } sequenceStartPositions.push_back(sequenceStartPositions.back() + - starts[seqId + 1] - starts[seqId]); + starts[seqId + 1] - starts[seqId]); } createSeqPos(sequenceStartPositions, &(*memoryFrameLine).sequenceStartPositions); } else { // for scatterAgentLayer for (size_t i = 0; i < numSequences; ++i) { - allIds.push_back(std::get<2>(seqLengthAndStart_[i])); + allIds.push_back(seqInfos_[0][i].seqId); } } // copy and check scatterId @@ -699,18 +770,19 @@ size_t RecurrentGradientMachine::getGenBatchSize() { for (auto& memoryFrameLine : memoryFrameLines_) { if (!memoryFrameLine.rootLayer) continue; Argument& bootArg = memoryFrameLine.rootLayer->getOutput(); - size_t batchSize = memoryFrameLine.is_sequence ? - bootArg.getNumSequences() : bootArg.getBatchSize(); + size_t batchSize = memoryFrameLine.is_sequence ? bootArg.getNumSequences() + : bootArg.getBatchSize(); if (numSequences) { CHECK_EQ(numSequences, batchSize); } else { numSequences = batchSize; } } - CHECK(numSequences) << "Fail to get batch size in generation. 
" - "At least one of the Memory layer MUST have a layer that is NOT in " - "the layer group to boot it, and this boot layer is used to " - "decide batch_size in generation process."; + CHECK(numSequences) + << "Fail to get batch size in generation. " + "At least one of the Memory layer MUST have a layer that is NOT in " + "the layer group to boot it, and this boot layer is used to " + "decide batch_size in generation process."; return numSequences; } @@ -732,7 +804,9 @@ void RecurrentGradientMachine::generateSequence() { // connect boot frame memory links std::vector ids(numSequences); - for (size_t i = 0; i < numSequences; ++i) { ids[i] = i; } + for (size_t i = 0; i < numSequences; ++i) { + ids[i] = i; + } for (auto& memoryFrameLine : memoryFrameLines_) { if (memoryFrameLine.rootAgent) { auto scatterAgent = @@ -756,7 +830,8 @@ void RecurrentGradientMachine::generateSequence() { // init outArg size_t resultNum = generator_.config.num_results_per_sample(); - IVector::resizeOrCreate(generator_.outArg.ids, + IVector::resizeOrCreate( + generator_.outArg.ids, generator_.config.max_num_frames() * numSequences * resultNum, false); if (resultNum > 1) { CHECK_LE(resultNum, static_cast(generator_.config.beam_size())); @@ -847,7 +922,9 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { // path.seqId = -1 indicates end of generation // of an input sequence finalPaths[seqIds_[j]].seqId = -1; - } else { scatterIds.push_back(j); } + } else { + scatterIds.push_back(j); + } } } @@ -856,13 +933,12 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { starts[0] = 0; generator_.ids.clear(); for (size_t i = 0; i < batchSize; ++i) { - generator_.ids.insert(generator_.ids.end(), - finalPaths[i].ids.begin(), + generator_.ids.insert(generator_.ids.end(), finalPaths[i].ids.begin(), finalPaths[i].ids.end()); starts[i + 1] = generator_.ids.size(); batchMachineIdVec_.insert(batchMachineIdVec_.end(), - finalPaths[i].machineIdVec.begin(), - finalPaths[i].machineIdVec.end()); + finalPaths[i].machineIdVec.begin(), + finalPaths[i].machineIdVec.end()); } } @@ -920,9 +996,9 @@ void RecurrentGradientMachine::forwardFrame(int machineCur) { } } -void RecurrentGradientMachine::singlePathExpand( - Path& curPath, size_t curPathId, std::vector& newPaths, - size_t expandWidth) { +void RecurrentGradientMachine::singlePathExpand(Path& curPath, size_t curPathId, + std::vector& newPaths, + size_t expandWidth) { int calc_id = gDiyProbStart ? gDiyProbStart(curPath.ids.size(), curPath.ids.data()) : 0; @@ -946,19 +1022,20 @@ void RecurrentGradientMachine::singlePathExpand( if (id == -1) break; real newLogProb = generator_.config.log_prob() ? 
std::log(prob) : prob; - Path newPath(curPath, id, newLogProb, - curPathId /*machineId*/, k /*topIndex*/); + Path newPath(curPath, id, newLogProb, curPathId /*machineId*/, + k /*topIndex*/); if (this->beamSearchCtrlCallbacks_) { if (beamSearchCtrlCallbacks_->stopDetermineCandidates( - newPath.seqId, newPath.ids, newPath.probHistory)) return; + newPath.seqId, newPath.ids, newPath.probHistory)) + return; } // outFrameLines_.size() > 1UL if (dataArgsSize_) { newPath.machineIdVec = curPath.machineIdVec; newPath.machineIdVec.push_back(curPathId); } - bool atEos = eosVec[index] == 1U || - newPath.ids.size() >= (size_t)maxSequenceLength_; + bool atEos = + eosVec[index] == 1U || newPath.ids.size() >= (size_t)maxSequenceLength_; // adjustNewPath newPath.adjustProb(calc_id, atEos); if (this->beamSearchCtrlCallbacks_) { @@ -966,16 +1043,18 @@ void RecurrentGradientMachine::singlePathExpand( newPath.seqId, newPath.ids, newPath.probHistory, &newPath.logProb); } if (!newPath.isDropable()) { - atEos ? finalPaths_[curPath.seqId].push_back(newPath) : - newPaths.push_back(newPath); + atEos ? finalPaths_[curPath.seqId].push_back(newPath) + : newPaths.push_back(newPath); } } // for expandWidth - if (gDiyProbStop) { gDiyProbStop(calc_id); } + if (gDiyProbStop) { + gDiyProbStop(calc_id); + } } -void RecurrentGradientMachine::beamExpand( - std::vector& paths, std::vector& newPaths) { +void RecurrentGradientMachine::beamExpand(std::vector& paths, + std::vector& newPaths) { size_t candidatePathCount = paths.size(); // idVec.size() could be larger than candidatePathCount * beam, // so user can drop some node customly. @@ -988,7 +1067,7 @@ void RecurrentGradientMachine::beamExpand( int curSeqId = 0; for (size_t j = 0; j <= candidatePathCount; j++) { // expansions of a single sequence are all processed - curSeqId = (j < candidatePathCount? paths[j].seqId : curSeqId + 1); + curSeqId = (j < candidatePathCount ? paths[j].seqId : curSeqId + 1); if (prevSeqId != -1 && curSeqId != prevSeqId) { totalExpandCount += beamShrink(newPaths, prevSeqId, totalExpandCount); } @@ -1000,11 +1079,14 @@ void RecurrentGradientMachine::beamExpand( } // Drop extra nodes to beam size. 
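For readers new to this file, the `beamShrink` routine reformatted just below keeps only the `beam_size` highest-probability candidate paths for one sequence. A minimal standalone sketch of that top-k pruning step, using a simplified `Path` stand-in rather than the actual Paddle class:

```cpp
#include <algorithm>
#include <vector>

// Simplified stand-in for the real Path class; only logProb matters here.
struct Path {
  double logProb;
};

// Keep only the `beam` highest-probability candidates among
// newPaths[start..end), mirroring the nth_element + resize pattern used by
// beamShrink. Assumes start <= newPaths.size().
size_t shrinkToBeam(std::vector<Path>& newPaths, size_t start, size_t beam) {
  size_t keep = std::min(beam, newPaths.size() - start);
  if (keep == 0) return 0;
  // Partition so the `keep` best paths occupy newPaths[start .. start+keep).
  std::nth_element(
      newPaths.begin() + start, newPaths.begin() + start + keep,
      newPaths.end(),
      [](const Path& a, const Path& b) { return a.logProb > b.logProb; });
  newPaths.resize(start + keep);
  return keep;
}
```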
-size_t RecurrentGradientMachine::beamShrink( - std::vector& newPaths, size_t seqId, size_t totalExpandCount) { - size_t minNewPathSize = std::min(getBeamSize(), - newPaths.size() - totalExpandCount); - if (!minNewPathSize) { return 0; } +size_t RecurrentGradientMachine::beamShrink(std::vector& newPaths, + size_t seqId, + size_t totalExpandCount) { + size_t minNewPathSize = + std::min(getBeamSize(), newPaths.size() - totalExpandCount); + if (!minNewPathSize) { + return 0; + } std::nth_element(newPaths.begin() + totalExpandCount, newPaths.begin() + totalExpandCount + minNewPathSize, newPaths.end(), Path::greaterPath); @@ -1017,11 +1099,8 @@ size_t RecurrentGradientMachine::beamShrink( // Remove the already formed paths that are relatively short finalPaths_[seqId].erase( - std::remove_if(finalPaths_[seqId].begin(), - finalPaths_[seqId].end(), - [&](Path& p) { - return p.logProb < minPathLogProb; - }), + std::remove_if(finalPaths_[seqId].begin(), finalPaths_[seqId].end(), + [&](Path& p) { return p.logProb < minPathLogProb; }), finalPaths_[seqId].end()); for (auto p : finalPaths_[seqId]) { if (minFinalPathLogProb_[seqId] > p.logProb) { @@ -1030,7 +1109,7 @@ size_t RecurrentGradientMachine::beamShrink( } if (finalPaths_[seqId].size() >= getBeamSize() && - minFinalPathLogProb_[seqId] >= maxPathLogProb) { + minFinalPathLogProb_[seqId] >= maxPathLogProb) { newPaths.resize(totalExpandCount); return 0; } @@ -1067,7 +1146,8 @@ void RecurrentGradientMachine::fillGenOutputs() { // in beam search, here only reserved the top 1 generated result // for out_links that are not the generated word indices. batchMachineIdVec_.insert(batchMachineIdVec_.end(), - path.machineIdVec.begin(), path.machineIdVec.end()); + path.machineIdVec.begin(), + path.machineIdVec.end()); } } starts[i + 1] = generator_.ids.size(); @@ -1091,21 +1171,21 @@ void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) { void RecurrentGradientMachine::createDataOutlink( std::vector& machineIdVec) { - size_t seqNum = getBeamSize() > 1UL ? - finalPaths_.size() : finalPaths_[0].size(); + size_t seqNum = + getBeamSize() > 1UL ? finalPaths_.size() : finalPaths_[0].size(); std::vector starts(seqNum + 1, 0); for (size_t i = 0; i < seqNum; ++i) { - size_t seqLen = getBeamSize() > 1UL ? finalPaths_[i][0].ids.size() : - finalPaths_[0][i].ids.size(); + size_t seqLen = getBeamSize() > 1UL ? finalPaths_[i][0].ids.size() + : finalPaths_[0][i].ids.size(); starts[i + 1] = starts[i] + seqLen; } for (size_t i = 0; i < dataArgsSize_; i++) { - dataArgs_[i].concat(dataArgsFrame_[i], machineIdVec, - starts, useGpu_, HPPL_STREAM_1, PASS_TEST); + dataArgs_[i].concat(dataArgsFrame_[i], machineIdVec, starts, useGpu_, + HPPL_STREAM_1, PASS_TEST); - auto dataAgent = dynamic_cast( - outFrameLines_[i + 1].agentLayer.get()); + auto dataAgent = + dynamic_cast(outFrameLines_[i + 1].agentLayer.get()); CHECK_NOTNULL(dataAgent); dataAgent->setData(dataArgs_[i]); } diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h index cc49d13952323db6e514ea437552d076187d91e2..6328213793ed6ca39214ec00124570ecb1ce273b 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include "GradientMachine.h" @@ -101,7 +100,7 @@ public: * Return true if this prefix or candidate is expected to be dropped. */ typedef std::function&, - const std::vector&)> DropCallback; + const std::vector&)> DropCallback; /** * @brief NormOrDropNodeCallback * The fourth parameter is the probability of the whole path. */ typedef std::function&, - std::vector&, real*)> NormOrDropNodeCallback; + std::vector&, real*)> NormOrDropNodeCallback; /** * @brief Register beam search control callbacks. Used for prediction. @@ -192,7 +191,7 @@ public: int machineId; // index of sample in frame int topIndex; // index of MaxIdLayer output in one sample - int seqId; // index of sequence in batch generation + int seqId; // index of sequence in batch generation std::vector machineIdVec; /** * @brief Path default ctor, first logProb is 0. */ - Path() { logProb = 0; seqId = 0; } + Path() { + logProb = 0; + seqId = 0; + } explicit Path(size_t seqId) : seqId(seqId) { logProb = 0; } /** @@ -319,21 +321,33 @@ protected: }; std::vector memoryFrameLines_; - // All inFrameLines and outFrameLines have the same element as follows. + // Each inFrameLine (inlink) has its own info (elements) below, + // and all outFrameLines (outlinks) share the info with one inFrameLine, + // which is assigned by targetInfoInlinkId_. struct Info { IVectorPtr allIds; // scattered id of realLayer std::vector idIndex; // index of allIds ICpuGpuVectorPtr - sequenceStartPositions; // scattered sequenceStartPositions + sequenceStartPositions; // scattered sequenceStartPositions std::vector seqStartPosIndex; // index of sequenceStartPositions }; - Info info_; + std::vector info_; + + // numSeqs_[i] is the number of sequences that are longer than i (for sequence + // data) or have more than i subsequences (for subsequence data) + std::vector numSeqs_; - // if no subSeq, tuple of (seqLength, seqStart, seqIndex, seqIndex) - // else, tuple of (subSeqLength, subSeqStart, seqIndex, subSeqIndex) - std::vector> seqLengthAndStart_; + std::vector> seqInfos_; - void createInFrameInfo(const Argument& input, PassType passType); + // the id of the inlink which shares info with the outlinks + int targetInfoInlinkId_; + + /* create scattered id information for all realLayer of inFrameLines one time. + * If hasSubseq, will also create scattered sequenceStartPositions information + * for all realLayer of inFrameLines one time. + */ + void createInFrameInfo(int inlinks_id, const Argument& input, + PassType passType); void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine, PassType passType); @@ -363,6 +377,9 @@ protected: NeuralNetwork* rootNetwork_; bool reversed_; + + // if hasSubseq: max number of sentences (subseq) in batchSize samples + // else: max number of tokens in batchSize samples (sentences) int maxSequenceLength_; bool useGpu_; bool stopBeamSearch_; @@ -415,7 +432,7 @@ private: * @param machineIdVec : select a row of output matrix in each frame * that the generation process expanded. 
*/ - void createDataOutlink(std::vector & machineIdVec); + void createDataOutlink(std::vector& machineIdVec); /* * @brief used in beam search, connect previous frame to form recurrent link diff --git a/paddle/gserver/layers/CTCLayer.cpp b/paddle/gserver/layers/CTCLayer.cpp index db1450694ecf7608fb37790e841b967288378e1f..6b9ffc5c749fb45be567881b8e625b48e28f69b4 100644 --- a/paddle/gserver/layers/CTCLayer.cpp +++ b/paddle/gserver/layers/CTCLayer.cpp @@ -49,8 +49,10 @@ void CTCLayer::forward(PassType passType) { Layer::forward(passType); if (useGpu_) { for (size_t i = 0; i < inputLayers_.size(); i++) { - tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1); + tmpCpuInput_[i].resizeAndCopyFrom( + getInput(i), false, HPPL_STREAM_DEFAULT); } + hl_stream_synchronize(HPPL_STREAM_DEFAULT); forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]); } else { forwardImp(getInput(0), getInput(1)); @@ -92,9 +94,9 @@ void CTCLayer::backward(const UpdateCallback &callback) { if (useGpu_) { backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]); const_cast(getInput(0)). - resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_1); + resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_DEFAULT); const_cast(getInput(1)). - resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_1); + resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_DEFAULT); } else { backwardImp(callback, getInput(0), getInput(1)); } diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp index d08c422764e5642816a94fc55b5b67445ffb42f7..8c72c1778451dfddbaa740921cd08cf73fe56785 100644 --- a/paddle/gserver/layers/ConvOperator.cpp +++ b/paddle/gserver/layers/ConvOperator.cpp @@ -248,7 +248,7 @@ void ConvOperator::forward() { CHECK_EQ(ins_[1]->value->getHeight(), batchSize); checkFilterSize(ins_[1]->value); Matrix::resizeOrCreate(out_->value, batchSize, - outputH_ * outputW_ * numFilters_); + outputH_ * outputW_ * numFilters_, false, useGpu_); { AsyncGpuBlock block; for (size_t batchId = 0; batchId < batchSize; ++batchId) { diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp index e092b2e390f37cd322db8bed8273f561fa979791..a81cf939af671f3fb34fb52ae33035a7bb524aed 100644 --- a/paddle/gserver/layers/ConvexCombinationLayer.cpp +++ b/paddle/gserver/layers/ConvexCombinationLayer.cpp @@ -21,18 +21,20 @@ limitations under the License. */ namespace paddle { /** - * @brief A layer for convex weighted average of vectors, + * @brief A layer for weighted sum of vectors, * which is used in NEURAL MACHINE TRANSLATION BY JOINTLY LEARNING TO ALIGN AND * TRANSLATE - * - Input: the first input contains the convex weights (batchSize x weightDim), - * and the shape of second input is (batchSize x (weightdim*dataDim)). - * - Output: the shape of output is (batchSize x dataDim). + * - Input: the size of the first input is weightDim, + * and the size of the second input is weightdim * dataDim. + * - Output: the size of the output is dataDim * \f[ - * out[i][j] = \sum_{j}(in0(i, j) * in1(i,j + i * dataDim)), - * i = 0,1,...,(batchSize-1); j = 0, 1,...,(dataDim-1) + * out(j) = \sum_{i}(in0(i) * in1(i,j + i * dataDim)), + * i = 0,1,...,(weightDim-1); j = 0, 1,...,(dataDim-1) * \f] + * Note that the above computation is for one sample. Multiple samples are + * processed in one batch. * - * The config file api is convex_comb_layer. + * The config file api is linear_comb_layer. 
*/ class ConvexCombinationLayer : public Layer { protected: diff --git a/paddle/gserver/layers/CosSimLayer.cpp b/paddle/gserver/layers/CosSimLayer.cpp index b10bd1d886ecf42170914c619b7b4040d984501d..05a70aeff5e8ff3789bca966d351bffc8efb1cb3 100644 --- a/paddle/gserver/layers/CosSimLayer.cpp +++ b/paddle/gserver/layers/CosSimLayer.cpp @@ -48,7 +48,7 @@ void CosSimLayer::forward(PassType passType) { REGISTER_TIMER_INFO("CosFwAtvTimer", getName().c_str()); MatrixPtr prevOut1 = getInputValue(0); MatrixPtr prevOut2 = getInputValue(1); - outV->cosSim(*prevOut1, *prevOut2, kCosSimScale_); + outV->cosSim(*prevOut1, *prevOut2, config_.cos_scale()); } } @@ -59,7 +59,7 @@ void CosSimLayer::backward(const UpdateCallback& callback) { outG->cosSimDerivative(*this->getOutputValue(), *getInputValue(0), *getInputValue(1), *getInputGrad(0), - *getInputGrad(1), kCosSimScale_); + *getInputGrad(1), config_.cos_scale()); } } diff --git a/paddle/gserver/layers/CosSimLayer.h b/paddle/gserver/layers/CosSimLayer.h index 9b0e53335b2503513ce11a4ab19f2199acfee499..65eb807ab2e6f16aab5ef2a9b08d697868c743a3 100644 --- a/paddle/gserver/layers/CosSimLayer.h +++ b/paddle/gserver/layers/CosSimLayer.h @@ -36,7 +36,7 @@ namespace paddle { class CosSimLayer : public Layer { public: explicit CosSimLayer(const LayerConfig& config) - : Layer(config), kCosSimScale_(5.0f) {} + : Layer(config) {} ~CosSimLayer() {} @@ -44,8 +44,6 @@ public: void forward(PassType passType); void backward(const UpdateCallback& callback = nullptr); - - const real kCosSimScale_; }; } // namespace paddle diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp index f353afabb3b7162783fef4f9093630fb826c86cb..0f99aee03200c3834c7c27343f41f77edc5a558e 100644 --- a/paddle/gserver/layers/CostLayer.cpp +++ b/paddle/gserver/layers/CostLayer.cpp @@ -509,8 +509,10 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label, Matrix &cost) { if (useGpu_) { for (size_t i = 0; i < inputLayers_.size(); i++) { - tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1); + tmpCpuInput_[i].resizeAndCopyFrom( + getInput(i), false, HPPL_STREAM_DEFAULT); } + hl_stream_synchronize(HPPL_STREAM_DEFAULT); } forwardImpIn(output, label, cost); } diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp index cef8772fc254f98d676e6fb89042487315280c61..3c6d13b0bf92ea98eb5c3331a1fdff6b177529b6 100644 --- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp +++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp @@ -115,29 +115,11 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) { create(tmpBiasGrad_, 1, channels_, &betaGrad); } - // because of the different api of cudnn v4 and v5. - if (hl_get_cudnn_lib_version() < 5000) { - if (weight_->getWGrad()) { - create(tmpWGrad_, 1, channels_, &gammaGrad); - } - if (biases_ && biases_->getWGrad()) { - create(tmpBiasGrad_, 1, channels_, &betaGrad); - } - } - hl_batch_norm_backward(ioDesc_, input, ioDesc_, outGrad, ioDesc_, inGrad, bnParamDesc_, gamma, gammaGrad, betaGrad, EPS, savedMean, savedInvVar); - // because of the different api of cudnn v4 and v5. 
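The CudnnBatchNormLayer hunk here raises the batch-norm guard from `CUDNN_VERSION >= 4000` to `>= 4007` and drops the v4/v5 gradient-accumulation workaround. A minimal sketch of the compile-time gate, assuming only a cuDNN header; the version-formula comment reflects how `cudnn.h` defines `CUDNN_VERSION`, and `batchNormSupported` is illustrative, not Paddle code:

```cpp
#include <cstdio>
#include <cudnn.h>

// cudnn.h defines CUDNN_VERSION as MAJOR*1000 + MINOR*100 + PATCHLEVEL, so
// the guard value 4007 corresponds to cuDNN release 4.0.7 specifically,
// not to any R4 build, which is presumably why 4000 was too permissive.
bool batchNormSupported() {
#if CUDNN_VERSION >= 4007
  return true;  // safe to compile the cudnnBatchNormalization* call sites
#else
  std::fprintf(stderr,
               "CudnnBatchNorm requires cuDNN >= 4.0.7, built with %d\n",
               CUDNN_VERSION);
  return false;
#endif
}
```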
- if (hl_get_cudnn_lib_version() < 5000) { - if (weight_->getWGrad() && biases_->getWGrad()) { - weight_->getWGrad()->add(*tmpWGrad_); - biases_->getWGrad()->add(*tmpBiasGrad_); - } - } - { REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); biases_->getParameterPtr()->incUpdate(callback); diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..68fee69f44d0c2c144f6dde6fd8ff36bd96094f6 --- /dev/null +++ b/paddle/gserver/layers/PrintLayer.cpp @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" + +namespace paddle { + +class PrintLayer : public Layer { +public: + explicit PrintLayer(const LayerConfig& config) + : Layer(config) {} + void forward(PassType passType); + void backward(const UpdateCallback& callback) {} +}; + +void PrintLayer::forward(PassType passType) { + Layer::forward(passType); + for (size_t i = 0; i != inputLayers_.size(); ++i) { + const auto& argu = getInput(i); + const std::string& name = inputLayers_[i]->getName(); + if (argu.value) { + std::ostringstream os; + argu.value->print(os); + LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str(); + } + if (argu.ids) { + std::ostringstream os; + argu.ids->print(os, argu.ids->getSize()); + LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str(); + } + if (auto startPos = argu.sequenceStartPositions) { + std::ostringstream os; + startPos->getVector(false)->print(os, startPos->getSize()); + LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str(); + } + if (auto subStartPos = argu.subSequenceStartPositions) { + std::ostringstream os; + subStartPos->getVector(false)->print(os, subStartPos->getSize()); + LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n" + << os.str(); + } + } +} + +REGISTER_LAYER(print, PrintLayer); + +} // namespace paddle diff --git a/paddle/gserver/layers/SamplingIdLayer.cpp b/paddle/gserver/layers/SamplingIdLayer.cpp index 41c1461967ae1c0ff3c4b3a11e8f7405b58f6ab9..b39c9948b53118b51090059fc554e76f94316f81 100644 --- a/paddle/gserver/layers/SamplingIdLayer.cpp +++ b/paddle/gserver/layers/SamplingIdLayer.cpp @@ -52,8 +52,10 @@ public: Layer::forward(passType); if (useGpu_) { for (size_t i = 0; i < inputLayers_.size(); i++) { - tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1); + tmpCpuInput_[i].resizeAndCopyFrom( + getInput(i), false, HPPL_STREAM_DEFAULT); } + hl_stream_synchronize(HPPL_STREAM_DEFAULT); forwardImp(tmpCpuInput_[0]); } else { forwardImp(getInput(0)); diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index f72011ae16cb3bac73e8acd5338bd7a179da329b..552a6c5b41c7f896c52b2132578b136200967573 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -92,7 +92,6 @@ void testState(LayerPtr testLayer, vector& dataLayers, testLayer->forward(PASS_TEST); Argument out; 
out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); if (batchOut.value) { size_t dim = batchOut.value->getWidth(); ASSERT_TRUE((bool)out.value); @@ -220,7 +219,6 @@ void testBatchState(LayerPtr testLayer, vector& dataLayers, testLayer->forward(PASS_TEST); Argument out; out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); if (batchOut.value) { size_t dim = batchOut.value->getWidth(); ASSERT_TRUE((bool)out.value); diff --git a/paddle/gserver/tests/Sequence/dummy.list b/paddle/gserver/tests/Sequence/dummy.list new file mode 100644 index 0000000000000000000000000000000000000000..0e52665e11298965df5738f69c5bcefcc8bab0f9 --- /dev/null +++ b/paddle/gserver/tests/Sequence/dummy.list @@ -0,0 +1 @@ +dummy_file_no_use diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/gserver/tests/rnn_data_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..85a83554c5c3045d144ee0250d2808237eccc9e0 --- /dev/null +++ b/paddle/gserver/tests/rnn_data_provider.py @@ -0,0 +1,35 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer.PyDataProvider2 import * + +data = [ + [[[1, 3, 2], [4, 5, 2]], 0], + [[[0, 2], [2, 5], [0, 1, 2]], 1], +] + +@provider(input_types=[integer_value_sub_sequence(10), + integer_value(2)]) +def process_subseq(settings, file_name): + for d in data: + yield d + +@provider(input_types=[integer_value_sequence(10), + integer_value(2)]) +def process_seq(settings, file_name): + for d in data: + seq = [] + for subseq in d[0]: + seq += subseq + yield seq, d[1] diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/gserver/tests/sequenceGen.py index e4727e472d446b48e6001968841bfc178e34ec0c..cb83d79d78cc677d5ffeb77f5693d08da2a51668 100644 --- a/paddle/gserver/tests/sequenceGen.py +++ b/paddle/gserver/tests/sequenceGen.py @@ -1,6 +1,3 @@ -#!/usr/bin/env python -#coding=utf-8 - # Copyright (c) 2016 Baidu, Inc. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/paddle/gserver/tests/sequence_nest_rnn.conf b/paddle/gserver/tests/sequence_nest_rnn.conf new file mode 100644 index 0000000000000000000000000000000000000000..62b8c5d072d7b42e46504defeff12f7e101384a0 --- /dev/null +++ b/paddle/gserver/tests/sequence_nest_rnn.conf @@ -0,0 +1,76 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_subseq') + + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +# This hierachical RNN is designed to be equivalent to the simple RNN in +# sequence_rnn.conf + +def outer_step(x): + outer_mem = memory(name="outer_rnn_state", size=hidden_dim) + def inner_step(y): + inner_mem = memory(name="inner_rnn_state", + size=hidden_dim, + boot_layer=outer_mem) + out = fc_layer(input=[y, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="inner_rnn_state") + return out + + inner_rnn_output = recurrent_group( + step=inner_step, + name="inner", + input=x) + last = last_seq(input=inner_rnn_output, name="outer_rnn_state") + + # "return last" should also work. But currently RecurrentGradientMachine + # does not handle it correctly. Current implementation requires that + # all the out links are from sequences. However, it does not report error + # when the out links are not sequences. + return inner_rnn_output + +out = recurrent_group( + name="outer", + step=outer_step, + input=SubsequenceInput(emb)) + +rep = last_seq(input=out) +prob = fc_layer(size=label_dim, + input=rep, + act=SoftmaxActivation(), + bias_attr=True) + +outputs(classification_cost(input=prob, + label=data_layer(name="label", size=label_dim))) diff --git a/paddle/gserver/tests/sequence_rnn.conf b/paddle/gserver/tests/sequence_rnn.conf new file mode 100644 index 0000000000000000000000000000000000000000..3294c2c3fc431c9d07aad0ba4620ec97a435fd91 --- /dev/null +++ b/paddle/gserver/tests/sequence_rnn.conf @@ -0,0 +1,57 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_seq') + + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +def step(y): + mem = memory(name="rnn_state", size=hidden_dim) + out = fc_layer(input=[y, mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="rnn_state") + return out + +out = recurrent_group( + name="rnn", + step=step, + input=emb) + +rep = last_seq(input=out) +prob = fc_layer(size=label_dim, + input=rep, + act=SoftmaxActivation(), + bias_attr=True) + +outputs(classification_cost(input=prob, + label=data_layer(name="label", size=label_dim))) diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp index 8e857781468fed694dbd061d896263bf05303260..3a591a316b8bafccac9c59ff28e57b4e27f8377a 100644 --- a/paddle/gserver/tests/test_Evaluator.cpp +++ b/paddle/gserver/tests/test_Evaluator.cpp @@ -87,18 +87,31 @@ void testEvaluator(TestConfig testConf, string testEvaluatorName, return; } + ICpuGpuVectorPtr sequenceStartPositions; + if (testConf.inputDefs[i].inputType == INPUT_SEQUENCE_DATA || + testConf.inputDefs[i].inputType == INPUT_SEQUENCE_LABEL) { + if (!sequenceStartPositions) { + generateSequenceStartPositions(batchSize, sequenceStartPositions); + } + data.sequenceStartPositions = sequenceStartPositions; + } + arguments.push_back(data); } Evaluator* testEvaluator = Evaluator::create(testConf.evaluatorConfig); double totalScore = 0.0; + testEvaluator->start(); totalScore += testEvaluator->evalImp(arguments); testEvaluator->updateSamplesNum(arguments); + testEvaluator->finish(); LOG(INFO) << *testEvaluator; double totalScore2 = 0.0; if (testConf.testAccumulate) { + testEvaluator->start(); totalScore2 += testEvaluator->evalImp(arguments); + testEvaluator->finish(); EXPECT_LE(fabs(totalScore - totalScore2), 1.0e-5); } } @@ -202,6 +215,15 @@ TEST(Evaluator, precision_recall) { false); } +TEST(Evaluator, ctc_error_evaluator) { + TestConfig config; + config.evaluatorConfig.set_type("ctc_edit_distance"); + + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "output", 32}); + config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "label", 1}); + testEvaluatorAll(config, "ctc_error_evaluator", 100); +} + int main(int argc, char** argv) { initMain(argc, argv); FLAGS_thread_local_rand_use_global_seed = true; diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp index 35d6ee7f4a402d198dbcd1df7b272dcd65723659..b73fdd18abf35858a366552120e69c8a039a4726 100644 --- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp +++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp @@ -21,6 +21,8 @@ limitations under the License. */ #include #include +P_DECLARE_int32(seed); + using namespace paddle; // NOLINT using namespace std; // NOLINT class TrainerForTest : public paddle::Trainer { @@ -68,7 +70,9 @@ void CalCost(const string& conf, const string& dir, real* cost, CpuVector vecMomentum(dim); // vecW needs to be assigned, otherwise the variable is an uncertain value. 
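The CalCost change just below replaces all-zero weight initialization with seeded Gaussian initialization, so the two configurations under test start from identical, non-trivial parameters and their per-pass costs are directly comparable. A minimal standard-library sketch of that idea; `initWeights` is a hypothetical helper, not Paddle API:

```cpp
#include <random>
#include <vector>

// Deterministic Gaussian init: a fixed seed makes repeated runs start from
// identical weights, matching randnorm(0, 0.1) seeded from FLAGS_seed below.
std::vector<float> initWeights(size_t dim, unsigned seed) {
  std::mt19937 gen(seed);
  std::normal_distribution<float> dist(0.0f, 0.1f);  // mean 0, stddev 0.1
  std::vector<float> w(dim);
  for (auto& v : w) {
    v = dist(gen);
  }
  return w;
}
```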
- vecW.zeroMem(); + + *ThreadLocalRand::getSeed() = FLAGS_seed; + vecW.randnorm(0, 0.1); trainer.startTrain(); for (int i = 0; i < num_passes; ++i) { @@ -88,27 +92,39 @@ void CalCost(const string& conf, const string& dir, real* cost, rmDir(dir.c_str()); } -TEST(RecurrentGradientMachine, HasSubSequence) { +void test(const string& conf1, const string& conf2, double eps) { int num_passes = 5; real* cost1 = new real[num_passes]; - const string conf1 = "gserver/tests/sequence_layer_group.conf"; const string dir1 = "gserver/tests/t1"; CalCost(conf1, dir1, cost1, num_passes); real* cost2 = new real[num_passes]; - const string conf2 = "gserver/tests/sequence_nest_layer_group.conf"; const string dir2 = "gserver/tests/t2"; CalCost(conf2, dir2, cost2, num_passes); for (int i = 0; i < num_passes; i++) { LOG(INFO) << "num_passes: " << i << ", cost1=" << cost1[i] - << ", cost2=" << cost2[i]; - ASSERT_NEAR(cost1[i], cost2[i], 1e-3); + << ", cost2=" << cost2[i] + << ", diff=" << std::abs(cost1[i] - cost2[i]); + ASSERT_NEAR(cost1[i], cost2[i], eps); } delete[] cost1; delete[] cost2; } +TEST(RecurrentGradientMachine, HasSubSequence) { + test("gserver/tests/sequence_layer_group.conf", + "gserver/tests/sequence_nest_layer_group.conf", + 1e-5); +} + +TEST(RecurrentGradientMachine, rnn) { + test("gserver/tests/sequence_rnn.conf", + "gserver/tests/sequence_nest_rnn.conf", + 0); +} + + int main(int argc, char** argv) { if (paddle::version::isWithPyDataProvider()) { if (!paddle::version::isWithGpu()) { diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp index 2cea190b859496cd635fc5a8d1834779537d50e6..9b933b153d158bef565c0964232525ba99b8b3d4 100644 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ b/paddle/gserver/tests/test_RecurrentLayer.cpp @@ -299,7 +299,6 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize, Argument& cpuInput = testCpu.dataLayer_->getOutput(); Argument& gpuInput = testGpu.dataLayer_->getOutput(); gpuInput.resizeAndCopyFrom(cpuInput, true); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE); const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE); diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index f3a6503d4a21ff8766f3289f8eee992d4d13045d..1b7f9ac5dac16c167dcc22930c28bc3521162b9b 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -146,6 +146,7 @@ void Matrix::resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width, if (!matrix) { matrix = Matrix::create(height, width, trans, useGpu); } else { + CHECK_EQ(matrix->useGpu(), useGpu); matrix->resize(height, width); } } @@ -161,6 +162,7 @@ void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, size_t height, } else { CHECK(dynamic_cast(matrix.get()) || dynamic_cast(matrix.get())); + CHECK_EQ(matrix->useGpu(), useGpu); matrix->resize(height, width, nnz, valueType, format); } } diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp index b1a459b86aa4ff70e4e07267c8a902123f9d17c0..7553ea25e09d2f52f1f8b9205f954510b77cbfa9 100644 --- a/paddle/math/Vector.cpp +++ b/paddle/math/Vector.cpp @@ -800,6 +800,7 @@ void CpuGpuVectorT::resizeOrCreate(size_t size, bool useGpu) { } else if ((!useGpu) && (!cpuVectorT_)) { cpuVectorT_ = VectorT::create(size, false); } else { + CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_)); this->resize(size, useGpu); } } diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 
index 8610a66452358e1b2e2a846ddfcf62a0ce99e22e..0ca56b29b39b317d01d80631e332ba02356a613d 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -25,6 +25,7 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu,
   if (!dest) {
     dest = src->clone(0, 0, useGpu);
   } else {
+    CHECK_EQ(dest->useGpu(), useGpu);
     dest->resize(src->getHeight(), src->getWidth());
   }
   dest->copyFrom(*src, stream);
@@ -60,12 +61,12 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src,
                           hl_stream_t stream = HPPL_STREAM_DEFAULT) {
   if (src) {
     CHECK_LE((size_t)startRow + copySize, src->getHeight());
-
     int height = copySize;
     int width = src->getWidth();
     if (!dest) {
       dest = src->clone(height, width, useGpu);
     } else {
+      CHECK_EQ(dest->useGpu(), useGpu);
       dest->resize(height, width);
     }
     MatrixPtr submat = src->subMatrix(startRow, copySize);
@@ -182,6 +183,11 @@ static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src,
   }
 }
 
+void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) {
+  resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+}
+
 void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
                                  hl_stream_t stream) {
   dataId = src.dataId;
@@ -199,6 +205,14 @@ void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
   resizeAndCopy(strs, src.strs, useGpu, stream);
 }
 
+int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,
+                                    int32_t copySize, bool useGpu) {
+  int32_t size = resizeAndCopyFrom(src, startSeq, copySize, useGpu,
+                                   HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  return size;
+}
+
 int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,
                                     int32_t copySize, bool useGpu,
                                     hl_stream_t stream) {
@@ -463,51 +477,34 @@ void Argument::splitByDataId(const std::vector<Argument>& argus,
   }
 }
 
-void Argument::getSeqLengthAndStart(
-    std::vector<std::tuple<int, int, int, int>>* seqLengthAndStart,
-    int* maxSequenceLength) const {
+void Argument::getSeqInfo(std::vector<SeqInfo>* seqInfo) const {
   const int* starts = sequenceStartPositions->getData(false);
-  if (hasSubseq()) {
-    size_t numSubSequences = getNumSubSequences();
-    (*seqLengthAndStart).reserve(numSubSequences);
-    const int* subStarts = subSequenceStartPositions->getData(false);
-    int seqIndex = 0;
-    int subSeqIndex = 0;
-    *maxSequenceLength = 0;
-    for (size_t i = 0; i < numSubSequences; ++i) {
-      if (subStarts[i] == starts[seqIndex]) {
-        subSeqIndex = 0;
-        (*seqLengthAndStart)
-            .push_back(std::make_tuple(
-                subStarts[i + 1] - subStarts[i], (int)subStarts[i],
-                (int)seqIndex, (int)subSeqIndex));
-        ++subSeqIndex;
-        ++seqIndex;
-      } else if (subStarts[i] < starts[seqIndex]) {
-        (*seqLengthAndStart)
-            .push_back(std::make_tuple(
-                subStarts[i + 1] - subStarts[i], (int)subStarts[i],
-                (int)seqIndex - 1, (int)subSeqIndex));
-        ++subSeqIndex;
+  const int* subStarts = hasSubseq()
+      ? subSequenceStartPositions->getData(false) : nullptr;
+  size_t numSequences = getNumSequences();
+  seqInfo->reserve(numSequences);
+  int subSeqEnd = 0;
+  for (size_t i = 0; i < numSequences; ++i) {
+    SeqInfo info;
+    info.seqStart = starts[i];
+    info.subLevelLength = starts[i + 1] - starts[i];
+    info.seqId = i;
+    if (hasSubseq()) {
+      info.subSeqStart = subSeqEnd;
+      while (subStarts[subSeqEnd] < starts[i + 1]) {
+        ++subSeqEnd;
       }
-      // maxSequenceLength_ = 1 + max(subSeqIndex) in each Seq.
-      if (*maxSequenceLength < std::get<3>((*seqLengthAndStart)[i]))
-        *maxSequenceLength = std::get<3>((*seqLengthAndStart)[i]);
-    }
-    *maxSequenceLength += 1;
-  } else {
-    size_t numSequences = getNumSequences();
-    (*seqLengthAndStart).reserve(numSequences);
-    for (size_t i = 0; i < numSequences; ++i) {
-      (*seqLengthAndStart)
-          .push_back(std::make_tuple(
-              starts[i + 1] - starts[i], (int)starts[i], (int)i, (int)i));
+      info.topLevelLength = subSeqEnd - info.subSeqStart;
+    } else {
+      info.topLevelLength = info.subLevelLength;
+      info.subSeqStart = 0;  // not used
     }
-    std::sort((*seqLengthAndStart).begin(), (*seqLengthAndStart).end(),
-              std::greater<std::tuple<int, int, int, int>>());
-
-    *maxSequenceLength = std::get<0>((*seqLengthAndStart)[0]);
+    seqInfo->push_back(info);
   }
+  std::sort(seqInfo->begin(), seqInfo->end(),
+            [](const SeqInfo& a, const SeqInfo& b) {
+              return a.topLevelLength > b.topLevelLength;
+            });
 }
 
 void Argument::checkSubset() const {
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index c444ebaf12930e938a3a4d75541d0fbf5bbb01ac..81cd117fc45cfa34da0810b01c5a710d9ce5950b 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -203,13 +203,28 @@ struct Argument {
    * startSeq: the sample id of start
    * copySize: how many samples need to copy
    * return value: how many samples are copied
+   * Note: when a stream is specified explicitly here, the caller must
+   * also synchronize that stream after this function returns
    */
   int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
-                            int32_t copySize, bool useGpu = FLAGS_use_gpu,
-                            hl_stream_t stream = HPPL_STREAM_DEFAULT);
+                            int32_t copySize, bool useGpu, hl_stream_t stream);
 
-  void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu,
-                         hl_stream_t stream = HPPL_STREAM_DEFAULT);
+  /*
+   * Same as the function above, except that the stream is
+   * HPPL_STREAM_DEFAULT and synchronization is called automatically
+   * inside it
+   */
+  int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
+                            int32_t copySize, bool useGpu = FLAGS_use_gpu);
+
+  void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);
+
+  /*
+   * Same as the function above, except that the stream is
+   * HPPL_STREAM_DEFAULT and synchronization is called automatically
+   * inside it
+   */
+  void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);
 
   /*
     @brief Concatenate several arguments into one and put the result into it.
@@ -238,12 +253,29 @@ struct Argument {
   static void splitByDataId(const std::vector<Argument>& argus,
                             std::vector<std::vector<Argument>>* arguGroups);
 
+  struct SeqInfo {
+    // Equal to sequence length for sequence data
+    // Equal to number of subsequences for subsequence data
+    int topLevelLength;
+
+    int seqStart;
+    int seqId;
+
+    // Equal to topLevelLength for sequence data
+    // Equal to sum of the length of subsequences for subsequence data
+    int subLevelLength;
+
+    // Only used for subsequence data: the start position of this sequence
+    // in subSequenceStartPositions, i.e.
+    // subSequenceStartPositions[subSeqStart] == seqStart
+    int subSeqStart;
+  };
   /*
-    Get Sequence Length, startPositions and max Length according to input
-   */
-  void getSeqLengthAndStart(
-      std::vector<std::tuple<int, int, int, int>>* seqLengthAndStart,
-      int* maxSequenceLength) const;
+    Get SeqInfo for each sequence of this argument.
+    Elements in *seqInfo are sorted by topLevelLength in descending order
+  */
+  void getSeqInfo(std::vector<SeqInfo>* seqInfo) const;
+
   /*
     Check whether sequenceStartPositions is subset of subSequenceStartPositions.
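For reference while reviewing the getSeqInfo() rewrite above, here is a rough Python sketch of the same bookkeeping. It is illustrative only and not part of the patch; the dict keys mirror the SeqInfo fields, and `get_seq_info`, `starts` and `sub_starts` are hypothetical names standing in for the C++ entities.

.. code-block:: python

    def get_seq_info(starts, sub_starts=None):
        # starts / sub_starts play the role of (sub)sequenceStartPositions.
        seq_info = []
        sub_seq_end = 0
        for i in range(len(starts) - 1):
            info = {'seqStart': starts[i],
                    'subLevelLength': starts[i + 1] - starts[i],
                    'seqId': i,
                    'subSeqStart': 0}
            if sub_starts is not None:
                # count the subsequences that fall inside sequence i
                info['subSeqStart'] = sub_seq_end
                while sub_starts[sub_seq_end] < starts[i + 1]:
                    sub_seq_end += 1
                info['topLevelLength'] = sub_seq_end - info['subSeqStart']
            else:
                info['topLevelLength'] = info['subLevelLength']
            seq_info.append(info)
        # sorted by topLevelLength in descending order, like the std::sort above
        return sorted(seq_info, key=lambda s: s['topLevelLength'], reverse=True)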
diff --git a/paddle/scripts/travis/build.sh b/paddle/scripts/travis/build_and_test.sh
similarity index 60%
rename from paddle/scripts/travis/build.sh
rename to paddle/scripts/travis/build_and_test.sh
index a644f2a4164f870dc88af9b8f357f5a3fb306d7d..3ea633be327027cc2093ad3a68158af1cfb097e7 100755
--- a/paddle/scripts/travis/build.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
-cd `dirname $0`
 source ./common.sh
 cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON
 make -j `nproc`
+env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j `nproc`"
+sudo make install
+sudo paddle version
diff --git a/paddle/scripts/travis/deploy_key.enc b/paddle/scripts/travis/deploy_key.enc
new file mode 100644
index 0000000000000000000000000000000000000000..b0aa45c5ac626c735735fd8541a43bf8b099d0a0
Binary files /dev/null and b/paddle/scripts/travis/deploy_key.enc differ
diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh
new file mode 100755
index 0000000000000000000000000000000000000000..c2a4809d75b97a9d8d8b83cf197e90bd62b48603
--- /dev/null
+++ b/paddle/scripts/travis/docs.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+# common.sh applies `set -e` and enters the build directory.
+source ./common.sh
+
+# Build the documentation only.
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON
+make paddle_docs paddle_docs_cn
+
+# Derive the SSH repository URL from the remote origin.
+REPO=`git config remote.origin.url`
+SSH_REPO=${REPO/https:\/\/github.com\//git@github.com:}
+SHA=`git rev-parse --verify HEAD`
+
+# Documentation branch name.
+# The gh-pages branch is used for PaddlePaddle.org. The English
+# documentation lives in the `doc` directory, and the Chinese version
+# lives in `doc_cn`.
+TARGET_BRANCH="gh-pages"
+
+# Only the master branch is deployed, so the site tracks the latest documentation.
+SOURCE_BRANCH="master"
+
+# Skip deployment for pull requests and for branches other than master.
+if [ "$TRAVIS_PULL_REQUEST" != "false" -o "$TRAVIS_BRANCH" != "$SOURCE_BRANCH" ]; then
+  exit 0
+fi
+
+# Clone the repo into the output directory.
+git clone $REPO output
+cd output
+
+# Check out the GitHub Pages branch, creating it if it does not exist yet.
+git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH
+
+# Remove the old docs and move the freshly built ones into place.
+rm -rf doc doc_cn
+mv ../doc_cn/html doc_cn
+mv ../doc/html doc
+
+# Check whether anything has changed.
+set +e
+git diff --exit-code >/dev/null
+if [ $? -eq 0 ]; then
+  echo "No changes to the output on this push; exiting."
+  exit 0
+fi
+set -e
+
+# Commit
+git add .
+git config user.name "Travis CI" +git config user.email "paddle-dev@baidu.com" +git commit -m "Deploy to GitHub Pages: ${SHA}" + +# Set ssh private key +openssl aes-256-cbc -K $SSL_KEY -iv $SSL_IV -in ../../paddle/scripts/travis/deploy_key.enc -out deploy_key -d +chmod 600 deploy_key +eval `ssh-agent -s` +ssh-add deploy_key + +# Push +git push $SSH_REPO $TARGET_BRANCH diff --git a/paddle/scripts/travis/main.sh b/paddle/scripts/travis/main.sh new file mode 100755 index 0000000000000000000000000000000000000000..c49d4546c24ac9304cd6f3c5940ed3d1d32ebb3d --- /dev/null +++ b/paddle/scripts/travis/main.sh @@ -0,0 +1,11 @@ +#!/bin/bash +cd `dirname $0` + +if [ ${JOB} == "BUILD_AND_TEST" ]; then + ./build_and_test.sh +elif [ ${JOB} == "DOCS" ]; then + ./docs.sh +else + echo Unknown job ${JOB} + exit 1 +fi diff --git a/paddle/scripts/travis/make_install.sh b/paddle/scripts/travis/make_install.sh deleted file mode 100755 index 08b2a648bb97de2c4f39c64efb9a41829faae0be..0000000000000000000000000000000000000000 --- a/paddle/scripts/travis/make_install.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -cd `dirname $0` -source ./common.sh -sudo make install -sudo paddle version diff --git a/paddle/scripts/travis/unittest.sh b/paddle/scripts/travis/unittest.sh deleted file mode 100755 index 45e8c85c1028efb98433ebc383931def30fae416..0000000000000000000000000000000000000000 --- a/paddle/scripts/travis/unittest.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -cd `dirname $0` -source ./common.sh -env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j `nproc`" - diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index d04620d363c14923455d68734b03ef9bb3f28f78..a2b243a7869eaff120b25ece35e95be4d4284d18 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -452,6 +452,9 @@ message SubModelConfig { repeated LinkConfig out_links = 10; optional GeneratorConfig generator = 11; + + // the id of inlink which share info with outlinks, used in recurrent layer group + optional int32 target_inlinkid = 12; } message ModelConfig { diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 68cc40247041093d3eec6dc93b22d358f4cbbaa1..fd9a003bb018c87fb8e8e2992390f27edfd72f4b 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -22,6 +22,8 @@ find_python_module(pip REQUIRED) find_python_module(wheel REQUIRED) find_python_module(google.protobuf REQUIRED) +add_subdirectory(paddle/trainer_config_helpers/tests) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/ DESTINATION opt/paddle/share/wheels ) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index b26a63e7f3c1d2fdfc5fea0a034a2f2c5238d1f0..f2f67f9bd66a4ebab9b5ace7fb13a194959d6c10 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -303,7 +303,8 @@ def MakeLayerNameInSubmodel(name, submodel_name = None): @config_func def RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, - seq_reversed=False): + seq_reversed=False, + target_inlinkname=""): global g_current_submodel config_assert(g_config.model_config.type == "recurrent_nn", "RecurrentLayerGroup should be used only in recurrent_nn") @@ -311,14 +312,19 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name, SubModelBegin(name) g_current_submodel.is_recurrent_layer_group = True g_current_submodel.reversed = seq_reversed + g_current_submodel.target_inlinkid = -1 in_links_count = 0 - for link in in_links: + for linkid, link in enumerate(in_links): if isinstance(link, basestring): name = link 
has_subseq = False else: name = link.link_name has_subseq = link.has_subseq + # assign target_inlinkid according to target_inlinkname + if target_inlinkname == name: + g_current_submodel.target_inlinkid = linkid + if in_links_count == 0: in_links_has_subseq = has_subseq else: @@ -331,6 +337,7 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name, SequenceScatterAgentLayer(name=name, size=layer.size) else: ScatterAgentLayer(name=name, size=layer.size) + pair = g_current_submodel.in_links.add() pair.layer_name = layer_name pair.link_name = MakeLayerNameInSubmodel(name) @@ -362,10 +369,12 @@ def RecurrentLayerGroupBegin(name, in_links, out_links, generator=None, + target_inlinkname="", seq_reversed=False): RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, - seq_reversed) + seq_reversed, + target_inlinkname) for link in out_links: RecurrentLayerGroupSetOutLink(link) @@ -1399,6 +1408,14 @@ class SelectiveFCLayer(LayerBase): input_index, psize, dims, sparse, format) self.create_bias_parameter(bias, self.config.size) +@config_layer('print') +class PrintLayer(LayerBase): + def __init__( + self, + name, + inputs): + super(PrintLayer, self).__init__(name, 'print', 0, inputs) + @config_layer('data') class DataLayer(LayerBase): def __init__( @@ -1614,7 +1631,7 @@ class BatchNormLayer(LayerBase): # Also based on cudnn version. use_cudnn = use_gpu and batch_norm_type != "batch_norm" and \ ((not parallel_nn) or self.config.device > -1) and \ - cudnn_version >= 4000 + cudnn_version >= 4007 self.layer_type = "cudnn_batch_norm" if use_cudnn else "batch_norm" super(BatchNormLayer, self).__init__(name, self.layer_type, 0, active_type=active_type, @@ -2264,6 +2281,9 @@ class ConvexCombinationLayer(LayerBase): name, 'convex_comb', size, inputs=inputs, device=device) config_assert(len(self.inputs) == 2, 'ConvexCombinationLayer must have 2 inputs') + config_assert( + size * self.get_input_layer(0).size == self.get_input_layer(1).size, + 'Wrong input size for ConvexCombinationLayer') self.set_layer_size(size) @config_layer('interpolation') @@ -2313,6 +2333,9 @@ class CosSimVecMatLayer(LayerBase): self.config.cos_scale = cos_scale config_assert(len(self.inputs) == 2, 'CosSimVecMatLayer must have 2 inputs') + config_assert( + size * self.get_input_layer(0).size == self.get_input_layer(1).size, + 'Wrong input size for CosSimVecMatLayer') @config_layer('sampling_id') class SamplingIdLayer(LayerBase): @@ -2361,6 +2384,7 @@ class CosSimLayer(LayerBase): self, name, inputs, + cos_scale=5, device=None): super(CosSimLayer, self).__init__( name, 'cos', 1, inputs=inputs, device=device) @@ -2368,6 +2392,7 @@ class CosSimLayer(LayerBase): config_assert( self.get_input_layer(0).size == self.get_input_layer(1).size, 'inputs of CosSimLayer must have same dim') + self.config.cos_scale = cos_scale @config_layer('tensor') diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py index 956bedadd75e5f389c25c37c1a466a0c3cb97430..985fae9f955c950d861d4f1f2f98845562fb6fc9 100644 --- a/python/paddle/trainer_config_helpers/evaluators.py +++ b/python/paddle/trainer_config_helpers/evaluators.py @@ -94,7 +94,7 @@ def evaluator_base( Batch=200 samples=20000 AvgCost=0.679655 CurrentCost=0.662179 Eval: classification_error_evaluator=0.4486 CurrentEval: ErrorRate=0.3964 - + :param input: Input layers, a object of LayerOutput or a list of LayerOutput. 
    :type input: list|LayerOutput
@@ -296,6 +296,7 @@ def precision_recall_evaluator(
 @wrap_name_default()
 def ctc_error_evaluator(
         input,
+        label,
         name=None,
         ):
     """
@@ -305,16 +306,20 @@
 
     .. code-block:: python
 
-       eval = ctc_error_evaluator(input)
+       eval = ctc_error_evaluator(input=input, label=lbl)
 
     :param name: Evaluator name.
     :type name: None|basestring
-    :param input: Input Layer.
+    :param input: Input Layer. Should be the same as the input for ctc_layer.
     :type input: LayerOutput
+    :param label: The input label, which is a data_layer. Should be the same
+        as the label used for the corresponding ctc_layer.
+    :type label: LayerOutput
     """
     evaluator_base(name=name,
                    type="ctc_edit_distance",
-                   input=input)
+                   input=input,
+                   label=label)
 
 @evaluator(EvaluatorAttribute.FOR_CLASSIFICATION)
 @wrap_name_default()
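With the new `label` argument, the evaluator is wired to the same `input` and `label` layers as the ctc_layer it accompanies. A minimal configuration sketch follows; `hidden` and `num_classes` are hypothetical placeholders, and the pattern mirrors `layers_test_config.py` at the end of this patch:

.. code-block:: python

    num_classes = 5
    # +1 for the CTC 'blank' label (see the ctc_layer note later in this patch)
    output = fc_layer(input=hidden, size=num_classes + 1,
                      act=SoftmaxActivation())
    lbl = data_layer(name="label", size=num_classes)
    ctc = ctc_layer(input=output, label=lbl, size=num_classes + 1)
    eval = ctc_error_evaluator(input=output, label=lbl)  # same input and label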
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index b7e5f566bb8c39fa6ea9ed491f28fa046bba71ee..bda0b4f5d60e82c1d577b0063fd5e164bf6117c3 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -21,7 +21,6 @@ from .evaluators import *
 from .poolings import MaxPooling, AvgPooling, BasePoolingType
 from .attrs import *
 from .default_decorators import *
-
 try:
     import cPickle as pickle
 except ImportError:
@@ -47,11 +46,12 @@ __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel",
            'BaseGeneratedInput', 'conv_operator', 'conv_shift_layer',
            'tensor_layer', 'selective_fc_layer', 'sampling_id_layer',
            'slope_intercept_layer', 'trans_full_matrix_projection',
+           'linear_comb_layer',
            'convex_comb_layer', 'ctc_layer', 'crf_layer', 'crf_decoding_layer',
            'cross_entropy_with_selfnorm', 'cross_entropy',
            'multi_binary_label_cross_entropy',
            'rank_cost', 'lambda_cost', 'huber_cost',
-           'block_expand_layer',
+           'block_expand_layer', 'out_prod_layer', 'print_layer'
           ]
@@ -70,7 +70,8 @@ class LayerType(object):
     POOLING_AVG = 'average'
     FC_LAYER = "fc"
     COST = 'cost'
-    COSINE_SIM = 'cos_vm'
+    COSINE_SIM_VEC = 'cos_vm'
+    COSINE_SIM = 'cos'
     HSIGMOID = 'hsigmoid'
     CONV_LAYER = "conv"
     POOL_LAYER = "pool"
@@ -91,6 +92,7 @@ class LayerType(object):
     POWER_LAYER = 'power'
     SCALING_LAYER = 'scaling'
     TRANS_LAYER = 'trans'
+    OUT_PROD_LAYER = 'out_prod'
 
     MEMORY = 'memory'
     MAXID_LAYER = 'maxid'
@@ -102,9 +104,11 @@ class LayerType(object):
     SEL_FC_LAYER = "selective_fc"
     SAMPLING_ID_LAYER = "sampling_id"
     SLOPE_INTERCEPT_LAYER = "slope_intercept"
-    CONVEX_COMBINATION_LAYER = "convex_comb"
+    LINEAR_COMBINATION_LAYER = "convex_comb"
     BLOCK_EXPAND = "blockexpand"
 
+    PRINT_LAYER = "print"
+
     CTC_LAYER = "ctc"
     CRF_LAYER = "crf"
     CRF_DECODING_LAYER = "crf_decoding"
@@ -171,6 +175,8 @@ class LayerOutput(object):
         assert LayerType.is_layer_type(layer_type)
         self.name = name
         self.layer_type = layer_type
+        if parents is not None and type(parents) != list:
+            parents = [parents]
         self.parents = [] if parents is None else parents
         self.activation = activation
         self.num_filters = num_filters
@@ -197,6 +203,25 @@ ERROR_CLIPPING = 'error_clipping_threshold'
 DROPOUT = 'drop_rate'
 
 
+def check_input(input):
+    """
+    Check that input is a LayerOutput, or a list/tuple of LayerOutput.
+    A single LayerOutput is wrapped into a one-element list.
+
+    :param input: The input layer. Could be a list/tuple of input layer.
+    :type input: LayerOutput|list|tuple
+    :return: list of LayerOutput
+    :rtype: list of LayerOutput
+    """
+
+    if isinstance(input, LayerOutput):
+        return [input]
+    assert isinstance(input, list)
+    for inp in input:
+        assert isinstance(inp, LayerOutput)
+    return list(input)
+
+
 def layer_support(*attrs):
     def decorator(method):
         @functools.wraps(method)
@@ -512,7 +537,7 @@ class MixedLayerType(LayerOutput):
         :rtype: MixedLayerType
         """
         if not self.finalized:
-            assert isinstance(other, Projection)
+            assert isinstance(other, Projection) or isinstance(other, Operator)
             self.inputs.append(other)
             self.parents.append(other.origin)
         return self
@@ -725,6 +750,27 @@ def fc_layer(input, size, act=None, name=None,
           size=size)
 
 
+@wrap_name_default("print")
+def print_layer(input, name=None):
+    """
+    Print the output value of the input layers. Useful for debugging.
+
+    :param name: The layer name.
+    :type name: basestring
+    :param input: The input layer. Could be a list/tuple of input layer.
+    :type input: LayerOutput|list|tuple
+    :return: No return value.
+    """
+    input = check_input(input)  # normalize a lone LayerOutput into a list
+
+    Layer(
+        name=name,
+        type=LayerType.PRINT_LAYER,
+        inputs=[l.name for l in input],
+    )
+    LayerOutput(name, LayerType.PRINT_LAYER, input)
+
+
 @wrap_name_default("seq_pooling")
 @wrap_bias_attr_default(has_bias=False)
 @wrap_param_default(['pooling_type'], default_factory=lambda _: MaxPooling())
@@ -1169,13 +1215,16 @@ def power_layer(input, weight, name=None, layer_attr=None):
 
 @layer_support()
 def scaling_layer(input, weight, name=None, layer_attr=None):
     """
-    A layer for each row of a matrix, multiplying with a element of a vector.
+    A layer that multiplies the input vector by a weight scalar.
 
     .. math::
-       y.row[i] = w[i] * x.row[i]
+       y = w x
+
+    where :math:`x` is a size=dataDim input, :math:`w` is a size=1 weight,
+    and :math:`y` is a size=dataDim output.
 
-    where :math:`x` is (batchSize x dataDim) input, :math:`w` is
-    (batchSize x 1) weight vector, and :math:`y` is (batchSize x dataDim) output.
+    Note that the above computation is for one sample. Multiple samples are
+    processed in one batch.
 
     The example usage is:
@@ -1249,11 +1298,14 @@ def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None):
 
     .. math::
        similarity = cos(\\theta) = {\\mathbf{a} \\cdot \\mathbf{b}
-                    \\over \\|\\mathbf{b}\\| \\|\\mathbf{b}\\|}
+                    \\over \\|\\mathbf{a}\\| \\|\\mathbf{b}\\|}
 
-    And the input dimension is :math:`a \in R^M`, :math:`b \in R^{MN}`. The
-    similarity will be calculated N times by step M. The output dimension is
-    :math:`R^N`. The scale will be multiplied to similarity.
+    The size of :math:`a` is M, and the size of :math:`b` is M*N. The
+    similarity is computed N times, once for each M-wide chunk of b. The
+    output size is N, and each output value is multiplied by scale.
+
+    Note that the above computation is for one sample. Multiple samples are
+    processed in one batch.
 
     :param name: layer name
     :type name: basestring
@@ -1270,14 +1322,23 @@
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
-    Layer(
-        name=name,
-        type=LayerType.COSINE_SIM,
-        size=size,
-        cos_scale=scale,
-        inputs=[a.name, b.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr)
-    )
+    if size == 1:
+        Layer(
+            name=name,
+            type=LayerType.COSINE_SIM,
+            cos_scale=scale,
+            inputs=[a.name, b.name],
+            **ExtraLayerAttribute.to_kwargs(layer_attr)
+        )
+    else:
+        Layer(
+            name=name,
+            type=LayerType.COSINE_SIM_VEC,
+            size=size,
+            cos_scale=scale,
+            inputs=[a.name, b.name],
+            **ExtraLayerAttribute.to_kwargs(layer_attr)
+        )
     return LayerOutput(name, LayerType.COSINE_SIM, parents=[a, b])
 
 @wrap_name_default()
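The reworked cos_sim now dispatches on size: the default size=1 emits layer type 'cos', while size=N falls back to the vector-matrix form 'cos_vm'. An illustrative sketch, with `vec_a`, `vec_b` and `mat_b` standing in for real layers:

.. code-block:: python

    # size == 1 (default): both inputs must have the same width; output size 1.
    sim = cos_sim(a=vec_a, b=vec_b)
    # size == N: b is treated as N chunks of a's width, so the parser checks
    # size * size(a) == size(b); the output width is N.
    sims = cos_sim(a=vec_a, b=mat_b, size=3)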
@@ -2326,6 +2387,39 @@ def maxid_layer(input, name=None, layer_attr=None):
                        layer_type=LayerType.MAXID_LAYER,
                        parents=[input])
 
+@wrap_name_default()
+def out_prod_layer(input1, input2, name=None, layer_attr=None):
+    """
+    A layer for computing the outer product of two vectors.
+    The result is a matrix of size(input1) x size(input2).
+
+    The example usage is:
+
+    .. code-block:: python
+
+       out_prod = out_prod_layer(input1=vec1, input2=vec2)
+
+    :param name: Layer name.
+    :type name: basestring
+    :param input1: The first input layer.
+    :type input1: LayerOutput
+    :param input2: The second input layer.
+    :type input2: LayerOutput
+    :param layer_attr: extra layer attributes.
+    :type layer_attr: ExtraLayerAttribute.
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input1, LayerOutput)
+    assert isinstance(input2, LayerOutput)
+    Layer(name=name,
+          type="out_prod",
+          inputs=[input1.name, input2.name],
+          **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(name=name,
+                       layer_type=LayerType.OUT_PROD_LAYER,
+                       parents=[input1, input2])
 
 @wrap_name_default()
 def eos_layer(input, eos_id, name=None, layer_attr=None):
@@ -2909,29 +3003,37 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0):
 
 @wrap_name_default()
-def convex_comb_layer(input, size, name=None):
+def linear_comb_layer(weights, vectors, size, name=None):
     """
-    A layer for convex weighted average of vectors takes two inputs.
-      - Input: a vector containing the convex weights (batchSize x weightdim),
-               and a matrix in a vector form (batchSize x (weightdim * datadim)).
-      - Output: a vector (batchSize * datadim).
+    A layer for the weighted sum of vectors. It takes two inputs.
+      - Input: the size of the weights is M,
+               the size of the vectors is M*N
+      - Output: a vector of size=N
 
     .. math::
 
-       y[i][j] = \sum_{j}(x_{1}(i, j) * x_{2}(i,j + i * dataDim)),
+       z(i) = \sum_{j=0}^{M-1} x(j) y(i + N j)
+
+       where :math:`0 \le i \le N-1`
+
+    Or in matrix notation:
 
-       i = 0,1,...,(batchSize-1); j = 0, 1,...,(dataDim-1)
+    .. math::
+
+       z = x^\mathrm{T} Y
 
     In this formula:
-      - :math:`x_{1}`: the first input.
-      - :math:`x_{2}`: the second input.
-      - :math:`y`: the output.
+      - :math:`x`: the weights.
+      - :math:`y`: the vectors.
+      - :math:`z`: the output.
+
+    Note that the above computation is for one sample. Multiple samples are
+    processed in one batch.
 
     The simple usage is:
 
     .. code-block:: python
 
-       convex_comb = convex_comb_layer(input=inputs,
+       linear_comb = linear_comb_layer(weights=weight, vectors=vectors,
                                        size=elem_dim)
 
     :param input: The input layers.
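As a quick sanity check of the formula above, the weighted sum z = x^T Y can be reproduced in plain NumPy; this is purely illustrative and independent of Paddle:

.. code-block:: python

    import numpy as np

    M, N = 2, 3
    x = np.array([0.5, 2.0])               # weights, size M
    y = np.arange(M * N, dtype=float)      # vectors, size M*N
    # z(i) = sum_j x(j) * y(i + N*j), i.e. x^T Y with Y = y.reshape(M, N)
    z = x.dot(y.reshape(M, N))
    assert z.shape == (N,)
    assert np.allclose(z, [6.0, 8.5, 11.0])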
@@ -2944,15 +3046,16 @@
     :rtype: LayerOutput
     """
-    assert isinstance(input, list) or isinstance(input, tuple)
-    assert len(input) == 2
     Layer(
         name=name,
-        type=LayerType.CONVEX_COMBINATION_LAYER,
+        type=LayerType.LINEAR_COMBINATION_LAYER,
         size=size,
-        inputs=[Input(input[0].name), Input(input[1].name)],
+        inputs=[Input(weights.name), Input(vectors.name)],
     )
-    return LayerOutput(name, LayerType.CONVEX_COMBINATION_LAYER, input, size=size)
+    return LayerOutput(name, LayerType.LINEAR_COMBINATION_LAYER,
+                       [weights, vectors], size=size)
+
+convex_comb_layer = linear_comb_layer  # backward-compatible alias
 
 @wrap_name_default()
 def block_expand_layer(input,
@@ -3036,6 +3139,17 @@ def ctc_layer(input, label, size, name=None, norm_by_times=False):
     classification task. That is, for sequence labeling problems where the
     alignment between the inputs and the target labels is unknown.
 
+    More details can be found by referring to `Connectionist Temporal
+    Classification: Labelling Unsegmented Sequence Data with Recurrent
+    Neural Networks `_
+
+    Note:
+        CTC needs an extra 'blank' label, so you need to use (num_classes + 1)
+        as the input size, where num_classes is the number of real categories
+        and 'blank' takes the last index. Hence the size of the 'input' layer
+        (e.g. an fc_layer with softmax activation) should be num_classes + 1,
+        and the size of ctc_layer should also be num_classes + 1.
+
     The simple usage:
 
     .. code-block:: python
@@ -3049,7 +3163,7 @@
     :type input: LayerOutput
     :param label: The data layer of label with variable length.
     :type label: LayerOutput
-    :param size: category numbers.
+    :param size: the number of categories + 1 (including the 'blank' label).
     :type size: int
     :param name: The name of this layer, which can be omitted.
     :type name: string|None
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..611fb855a8c9ad6679167105dd737c995b23c209
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -0,0 +1,5 @@
+#################### test_config_parser #########################
+add_test(NAME layers_test
+  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+    python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
+  WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
diff --git a/python/paddle/trainer_config_helpers/tests/layers_test.py b/python/paddle/trainer_config_helpers/tests/layers_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b55667354750066a7d3ab3a0af59eb9e7d47d86
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/layers_test.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from paddle.trainer.config_parser import parse_config_and_serialize + +if __name__ == '__main__': + parse_config_and_serialize( + 'trainer_config_helpers/tests/layers_test_config.py', '') diff --git a/python/paddle/trainer_config_helpers/tests/layers_test_config.py b/python/paddle/trainer_config_helpers/tests/layers_test_config.py new file mode 100644 index 0000000000000000000000000000000000000000..39c85c788eecad5c6bba6dbd2f2734725fa4fff6 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/layers_test_config.py @@ -0,0 +1,56 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +num_classes = 5 + +x = data_layer(name="input1", size=3) +y = data_layer(name="input2", size=5) + +z = out_prod_layer(input1=x, input2=y) + +x1 = fc_layer(input=x, size=5) +y1 = fc_layer(input=y, size=5) +y2 = fc_layer(input=y, size=15) + +cos1 = cos_sim(a=x1, b=y1) +cos3 = cos_sim(a=x1, b=y2, size=3) + +linear_comb = linear_comb_layer(weights=x1, vectors=y2, size=3) + +out = fc_layer(input=[cos1, cos3, linear_comb, z], + size=num_classes, + act=SoftmaxActivation()) + +print_layer(input=[out]) + +outputs(classification_cost(out, data_layer(name="label", size=num_classes))) + +# for ctc +tmp = fc_layer(input=x1, + size=num_classes + 1, + act=SoftmaxActivation()) +ctc = ctc_layer(input=tmp, + label=y, + size=num_classes + 1) +ctc_eval = ctc_error_evaluator(input=tmp, label=y) + +settings( + batch_size=10, + learning_rate=2e-3, + learning_method=AdamOptimizer(), + regularization=L2Regularization(8e-4), + gradient_clipping_threshold=25 +)
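A closing note on print_layer, which this test config exercises: assuming the check_input() fix earlier in the patch (a lone LayerOutput is wrapped into a one-element list), the two spellings below are equivalent; `out` is the hypothetical layer from the config above.

.. code-block:: python

    # Both forms pass through check_input() and print the same layer.
    print_layer(input=out)
    print_layer(input=[out])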