BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit 93006787
Authored on Sep 13, 2016 by emailweixu; committed via GitHub on Sep 13, 2016.
Merge pull request #73 from reyoung/merge_icode
Merge Baidu Internal Changes.
Parents: 487dc670, c7762da3
Showing 26 changed files with 487 additions and 138 deletions (+487 / -138).
doc/build/index.rst (+1, -0)
doc/cluster/index.rst (+1, -0)
doc_cn/build_and_install/index.rst (+4, -0)
doc_cn/cluster/index.rst (+11, -0)
doc_cn/index.rst (+1, -1)
paddle/gserver/evaluators/CTCErrorEvaluator.cpp (+2, -2)
paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp (+175, -98)
paddle/gserver/gradientmachines/RecurrentGradientMachine.h (+34, -13)
paddle/gserver/layers/CTCLayer.cpp (+5, -3)
paddle/gserver/layers/ConvOperator.cpp (+1, -1)
paddle/gserver/layers/CostLayer.cpp (+3, -1)
paddle/gserver/layers/SamplingIdLayer.cpp (+3, -1)
paddle/gserver/tests/LayerGradUtil.cpp (+0, -2)
paddle/gserver/tests/Sequence/dummy.list (+1, -0)
paddle/gserver/tests/rnn_data_provider.py (+35, -0)
paddle/gserver/tests/sequenceGen.py (+0, -3)
paddle/gserver/tests/sequence_nest_rnn.conf (+75, -0)
paddle/gserver/tests/sequence_rnn.conf (+57, -0)
paddle/gserver/tests/test_RecurrentGradientMachine.cpp (+17, -4)
paddle/gserver/tests/test_RecurrentLayer.cpp (+0, -1)
paddle/math/Matrix.cpp (+2, -0)
paddle/math/Vector.cpp (+1, -0)
paddle/parameter/Argument.cpp (+15, -1)
paddle/parameter/Argument.h (+28, -4)
proto/ModelConfig.proto.m4 (+3, -0)
python/paddle/trainer/config_parser.py (+12, -3)
doc/build/index.rst  (view file @ 93006787)

@@ -9,6 +9,7 @@ Install PaddlePaddle
    :glob:

    install_*
+   internal/install_from_jumbo.md

 Build from Source
 -----------------
doc/cluster/index.rst  (view file @ 93006787)

@@ -5,3 +5,4 @@ Cluster Train
    :glob:

    opensource/cluster_train.md
+   internal/index.md
doc_cn/build_and_install/index.rst  (view file @ 93006787)

@@ -9,7 +9,11 @@ Note: The intallation packages are still in pre-release state and your experienc
 .. toctree::
    :maxdepth: 1
    :glob:

+   源码下载(对内) <../build/internal/download_paddle_source_zh_cn.rst>
+   使用Jumbo安装(对内) <../build/internal/install_from_jumbo.rst>
+   从源码编译安装(对内) <../build/internal/build_from_source_zh_cn.rst>
    install/docker_install.rst
    install/ubuntu_install.rst
    cmake/index.rst
doc_cn/cluster/index.rst  (new file, 0 → 100644; view file @ 93006787)

集群训练
========

* `集群训练 <../../doc/cluster/index.html>`_

.. toctree::
   :maxdepth: 2
   :glob:

   集群训练(对内) <internal/index.md>
doc_cn/index.rst  (view file @ 93006787)

@@ -8,7 +8,7 @@ PaddlePaddle文档
 * `用户接口 <ui/index.html>`_
 * `使用示例 <demo/index.html>`_
 * `模型配置 <../doc/ui/api/trainer_config_helpers/index.html>`_
-* `集群训练 <../doc/cluster/index.html>`_
+* `集群训练 <cluster/index.html>`_

 开发指南
 --------
paddle/gserver/evaluators/CTCErrorEvaluator.cpp  (view file @ 93006787)

@@ -194,8 +194,8 @@ public:
   virtual real evalImp(std::vector<Argument>& arguments) {
     CHECK_EQ(arguments.size(), (size_t)2);
     Argument output, label;
-    output.resizeAndCopyFrom(arguments[0], false);
-    label.resizeAndCopyFrom(arguments[1], false);
+    output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT);
+    label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT);
     hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     CHECK(label.sequenceStartPositions);
     CHECK(label.ids);
paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp  (view file @ 93006787)

This diff is collapsed. Click to expand it.
paddle/gserver/gradientmachines/RecurrentGradientMachine.h  (view file @ 93006787)

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-
 #include "GradientMachine.h"

@@ -101,7 +100,7 @@ public:
    * Return true if this prefix or candidate is expected to be dropped.
    */
   typedef std::function<bool(int seqId, const std::vector<int>&,
                              const std::vector<real>&)> DropCallback;

  /**
   * @brief NormOrDropNodeCallback

@@ -117,7 +116,7 @@ public:
    * The fourth parameter is the probability of the whole path.
    */
   typedef std::function<void(int seqId, const std::vector<int>&,
                              std::vector<real>&, real*)> NormOrDropNodeCallback;

  /**
   * @brief Register beam search control callbacks. Used for prediction.

@@ -192,7 +191,7 @@ public:
     int machineId;  // index of sample in frame
     int topIndex;   // index of MaxIdLayer output in one sample
     int seqId;      // index of sequence in batch generation
     std::vector<int> machineIdVec;

  /**

@@ -206,7 +205,10 @@ public:
     /**
      * @brief Path default ctor, first logProb is 0.
      */
-    Path() { logProb = 0; seqId = 0; }
+    Path() {
+      logProb = 0;
+      seqId = 0;
+    }
     explicit Path(size_t seqId) : seqId(seqId) { logProb = 0; }

  /**

@@ -319,21 +321,37 @@ protected:
   };
   std::vector<MemoryFrameLine> memoryFrameLines_;

-  // All inFrameLines and outFrameLines have the same element as follows.
+  // Each inFrameLines(inlinks) has its own info(elements) below,
+  // and all outFrameLines(outlinks) share the info with one inFrameLine,
+  // which is assigned by targetInfoInlinkId_.
   struct Info {
     IVectorPtr allIds;         // scattered id of realLayer
     std::vector<int> idIndex;  // index of allIds
     ICpuGpuVectorPtr sequenceStartPositions;  // scattered sequenceStartPositions
     std::vector<int> seqStartPosIndex;        // index of sequenceStartPositions
   };
-  Info info_;
+  std::vector<Info> info_;

   // numSeqs_[i] is the number sequences which is longer than i (for sequence
   // data) or has more than i subsequences (for subsequence data)
   std::vector<int> numSeqs_;

-  // if no subSeq, tuple of (seqLength, seqStart, seqIndex, seqIndex)
-  // else, tuple of (subSeqLength, subSeqStart, seqIndex, subSeqIndex)
-  std::vector<std::tuple<int, int, int, int>> seqLengthAndStart_;
+  // each inlinks has a "std::vector<std::tuple<int, int, int, int>>" denotes
+  // its sequence info:
+  // if hasSubSeq, tuple of (subSeqLength, subSeqStart, seqIndex, subSeqIndex)
+  // else, tuple of (seqLength, seqStart, seqIndex, seqIndex)
+  std::vector<std::vector<std::tuple<int, int, int, int>>> seqLengthAndStart_;

-  void createInFrameInfo(const Argument& input, PassType passType);
+  // the id of inlink which share info with outlinks
+  int targetInfoInlinkId_;
+
+  /* create scattered id infomation for all realLayer of inFrameLines one time.
+   * If hasSubseq, will also create scattered sequenceStartPositions infomation
+   * for all realLayer of inFrameLines one time.
+   */
+  void createInFrameInfo(int inlinks_id, const Argument& input,
+                         PassType passType);

   void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine,
                              PassType passType);

@@ -363,6 +381,9 @@ protected:
   NeuralNetwork* rootNetwork_;
   bool reversed_;
+
+  // if hasSubseq: max number of sentences(subseq)in batchsize samples
+  // else: max number of tokens in batchsize samples(sentences)
   int maxSequenceLength_;
   bool useGpu_;
   bool stopBeamSearch_;

@@ -415,7 +436,7 @@ private:
    * @param machineIdVec : select a row of output matrix in each frame
    * that the generation process expanded.
    */
-  void createDataOutlink(std::vector<int> & machineIdVec);
+  void createDataOutlink(std::vector<int>& machineIdVec);

  /*
   * @brief used in beam search, connect previous frame to form recurrent link
paddle/gserver/layers/CTCLayer.cpp  (view file @ 93006787)

@@ -49,8 +49,10 @@ void CTCLayer::forward(PassType passType) {
   Layer::forward(passType);
   if (useGpu_) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
     }
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
   } else {
     forwardImp(getInput(0), getInput(1));

@@ -92,9 +94,9 @@ void CTCLayer::backward(const UpdateCallback &callback) {
   if (useGpu_) {
     backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]);
     const_cast<Argument&>(getInput(0)).
-        resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_1);
+        resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_DEFAULT);
     const_cast<Argument&>(getInput(1)).
-        resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_1);
+        resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_DEFAULT);
   } else {
     backwardImp(callback, getInput(0), getInput(1));
   }
paddle/gserver/layers/ConvOperator.cpp  (view file @ 93006787)

@@ -248,7 +248,7 @@ void ConvOperator::forward() {
   CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
   checkFilterSize(ins_[1]->value);
   Matrix::resizeOrCreate(out_->value, batchSize,
-                         outputH_ * outputW_ * numFilters_);
+                         outputH_ * outputW_ * numFilters_, false, useGpu_);
   {
     AsyncGpuBlock block;
     for (size_t batchId = 0; batchId < batchSize; ++batchId) {
paddle/gserver/layers/CostLayer.cpp  (view file @ 93006787)

@@ -509,8 +509,10 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label,
                                Matrix &cost) {
   if (useGpu_) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
     }
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
   forwardImpIn(output, label, cost);
 }
paddle/gserver/layers/SamplingIdLayer.cpp  (view file @ 93006787)

@@ -52,8 +52,10 @@ public:
     Layer::forward(passType);
     if (useGpu_) {
       for (size_t i = 0; i < inputLayers_.size(); i++) {
-        tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+        tmpCpuInput_[i].resizeAndCopyFrom(
+            getInput(i), false, HPPL_STREAM_DEFAULT);
       }
+      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
       forwardImp(tmpCpuInput_[0]);
     } else {
       forwardImp(getInput(0));
paddle/gserver/tests/LayerGradUtil.cpp  (view file @ 93006787)

@@ -92,7 +92,6 @@ void testState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers,
     testLayer->forward(PASS_TEST);
     Argument out;
     out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     if (batchOut.value) {
       size_t dim = batchOut.value->getWidth();
       ASSERT_TRUE((bool)out.value);

@@ -220,7 +219,6 @@ void testBatchState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers,
     testLayer->forward(PASS_TEST);
     Argument out;
     out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     if (batchOut.value) {
       size_t dim = batchOut.value->getWidth();
       ASSERT_TRUE((bool)out.value);
paddle/gserver/tests/Sequence/dummy.list  (new file, 0 → 100644; view file @ 93006787)

dummy_file_no_use
paddle/gserver/tests/rnn_data_provider.py  (new file, 0 → 100644; view file @ 93006787)

# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.trainer.PyDataProvider2 import *

data = [
    [[[1, 3, 2], [4, 5, 2]], 0],
    [[[0, 2], [2, 5], [0, 1, 2]], 1],
]


@provider(input_types=[integer_value_sub_sequence(10),
                       integer_value(2)])
def process_subseq(settings, file_name):
    for d in data:
        yield d


@provider(input_types=[integer_value_sequence(10),
                       integer_value(2)])
def process_seq(settings, file_name):
    for d in data:
        seq = []
        for subseq in d[0]:
            seq += subseq
        yield seq, d[1]
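For readers comparing the two providers above: process_subseq yields the nested records as-is, while process_seq flattens each record's sub-sequences before yielding. A minimal standalone sketch of that flattening (plain Python, no PaddlePaddle imports; the toy records mirror the data list above):

# Standalone sketch of the flattening done by process_seq; the records
# mirror the `data` list in rnn_data_provider.py.
data = [
    [[[1, 3, 2], [4, 5, 2]], 0],
    [[[0, 2], [2, 5], [0, 1, 2]], 1],
]

for subsequences, label in data:
    seq = []
    for subseq in subsequences:
        seq += subseq  # concatenate sub-sequences into one flat sequence
    print(seq, label)
    # -> [1, 3, 2, 4, 5, 2] 0
    # -> [0, 2, 2, 5, 0, 1, 2] 1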
paddle/gserver/tests/sequenceGen.py  (view file @ 93006787)

 #!/usr/bin/env python
 #coding=utf-8
 # Copyright (c) 2016 Baidu, Inc. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
...
paddle/gserver/tests/sequence_nest_rnn.conf  (new file, 0 → 100644; view file @ 93006787)

#edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.trainer_config_helpers import *

######################## data source ################################
define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
                        test_list=None,
                        module='rnn_data_provider',
                        obj='process_subseq')


settings(batch_size=2, learning_rate=0.01)

######################## network configure ################################
dict_dim = 10
word_dim = 8
hidden_dim = 8
label_dim = 3

data = data_layer(name="word", size=dict_dim)

emb = embedding_layer(input=data, size=word_dim)

# This hierachical RNN is designed to be equivalent to the simple RNN in
# sequence_rnn.conf

def outer_step(x):
    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)

    def inner_step(y):
        inner_mem = memory(name="inner_rnn_state",
                           size=hidden_dim,
                           boot_layer=outer_mem)
        return fc_layer(input=[y, inner_mem],
                        size=hidden_dim,
                        act=TanhActivation(),
                        bias_attr=True,
                        name="inner_rnn_state")

    inner_rnn_output = recurrent_group(
        step=inner_step,
        input=x)
    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")

    # "return last" should also work. But currently RecurrentGradientMachine
    # does not handle it correctly. Current implementation requires that
    # all the out links are from sequences. However, it does not report error
    # when the out links are not sequences.
    return inner_rnn_output

out = recurrent_group(
    step=outer_step,
    input=SubsequenceInput(emb))

value_printer_evaluator(input=out)

rep = last_seq(input=out)
prob = fc_layer(size=label_dim,
                input=rep,
                act=SoftmaxActivation(),
                bias_attr=True)

outputs(classification_cost(input=prob,
                            label=data_layer(name="label", size=label_dim)))
paddle/gserver/tests/sequence_rnn.conf  (new file, 0 → 100644; view file @ 93006787)

#edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.trainer_config_helpers import *

######################## data source ################################
define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
                        test_list=None,
                        module='rnn_data_provider',
                        obj='process_seq')


settings(batch_size=2, learning_rate=0.01)

######################## network configure ################################
dict_dim = 10
word_dim = 8
hidden_dim = 8
label_dim = 3

data = data_layer(name="word", size=dict_dim)

emb = embedding_layer(input=data, size=word_dim)

def step(y):
    mem = memory(name="rnn_state", size=hidden_dim)
    return fc_layer(input=[y, mem],
                    size=hidden_dim,
                    act=TanhActivation(),
                    bias_attr=True,
                    name="rnn_state")

out = recurrent_group(
    step=step,
    input=emb)

value_printer_evaluator(input=out)

rep = last_seq(input=out)
prob = fc_layer(size=label_dim,
                input=rep,
                act=SoftmaxActivation(),
                bias_attr=True)

outputs(classification_cost(input=prob,
                            label=data_layer(name="label", size=label_dim)))
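The comment in sequence_nest_rnn.conf states that the hierarchical configuration is meant to be equivalent to the flat RNN in sequence_rnn.conf. A toy sketch of why that is expected to hold for one sample: the inner memory of each sub-sequence is booted from the outer memory's last state, so tokens are visited in the same order as in the flat recurrence. The update rule f below is a hypothetical stand-in for the shared fc_layer, not the actual layer:

# Toy check of the equivalence claim: a flat recurrence over the concatenated
# tokens versus a nested recurrence whose inner state is booted from the
# outer state. f is a hypothetical scalar update, not PaddlePaddle code.
def f(state, token):
    return 0.5 * state + token  # hypothetical recurrent update

tokens_nested = [[1, 3, 2], [4, 5, 2]]                    # as in process_subseq
tokens_flat = [t for sub in tokens_nested for t in sub]   # as in process_seq

# flat RNN: one state threaded through every token
state = 0.0
for t in tokens_flat:
    state = f(state, t)
flat_final = state

# nested RNN: inner loop per sub-sequence, inner state booted from outer state
outer = 0.0
for sub in tokens_nested:
    inner = outer              # boot_layer=outer_mem in sequence_nest_rnn.conf
    for t in sub:
        inner = f(inner, t)
    outer = inner              # last_seq(...) feeds the outer memory
print(flat_final == outer)     # -> True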
paddle/gserver/tests/test_RecurrentGradientMachine.cpp  (view file @ 93006787)

@@ -21,6 +21,8 @@ limitations under the License. */
 #include <paddle/trainer/TrainerInternal.h>
 #include <paddle/gserver/gradientmachines/GradientMachine.h>

+P_DECLARE_int32(seed);
+
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT

 class TrainerForTest : public paddle::Trainer {

@@ -68,7 +70,9 @@ void CalCost(const string& conf, const string& dir, real* cost,
   CpuVector vecMomentum(dim);

   // vecW needs to be assigned, otherwise the variable is an uncertain value.
-  vecW.zeroMem();
+  *ThreadLocalRand::getSeed() = FLAGS_seed;
+  vecW.randnorm(0, 0.1);

   trainer.startTrain();
   for (int i = 0; i < num_passes; ++i) {

@@ -88,15 +92,13 @@ void CalCost(const string& conf, const string& dir, real* cost,
   rmDir(dir.c_str());
 }

-TEST(RecurrentGradientMachine, HasSubSequence) {
+void test(const string& conf1, const string& conf2) {
   int num_passes = 5;
   real* cost1 = new real[num_passes];
-  const string conf1 = "gserver/tests/sequence_layer_group.conf";
   const string dir1 = "gserver/tests/t1";
   CalCost(conf1, dir1, cost1, num_passes);

   real* cost2 = new real[num_passes];
-  const string conf2 = "gserver/tests/sequence_nest_layer_group.conf";
   const string dir2 = "gserver/tests/t2";
   CalCost(conf2, dir2, cost2, num_passes);

@@ -109,6 +111,17 @@ TEST(RecurrentGradientMachine, HasSubSequence) {
   delete[] cost2;
 }

+TEST(RecurrentGradientMachine, HasSubSequence) {
+  test("gserver/tests/sequence_layer_group.conf",
+       "gserver/tests/sequence_nest_layer_group.conf");
+}
+
+TEST(RecurrentGradientMachine, rnn) {
+  test("gserver/tests/sequence_rnn.conf",
+       "gserver/tests/sequence_nest_rnn.conf");
+}
+
 int main(int argc, char** argv) {
   if (paddle::version::isWithPyDataProvider()) {
     if (!paddle::version::isWithGpu()) {
paddle/gserver/tests/test_RecurrentLayer.cpp  (view file @ 93006787)

@@ -299,7 +299,6 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize,
   Argument& cpuInput = testCpu.dataLayer_->getOutput();
   Argument& gpuInput = testGpu.dataLayer_->getOutput();
   gpuInput.resizeAndCopyFrom(cpuInput, true);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);

   const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE);
   const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE);
paddle/math/Matrix.cpp  (view file @ 93006787)

@@ -146,6 +146,7 @@ void Matrix::resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width,
   if (!matrix) {
     matrix = Matrix::create(height, width, trans, useGpu);
   } else {
+    CHECK_EQ(matrix->useGpu(), useGpu);
     matrix->resize(height, width);
   }
 }

@@ -161,6 +162,7 @@ void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, size_t height,
   } else {
     CHECK(dynamic_cast<CpuSparseMatrix*>(matrix.get()) ||
           dynamic_cast<GpuSparseMatrix*>(matrix.get()));
+    CHECK_EQ(matrix->useGpu(), useGpu);
     matrix->resize(height, width, nnz, valueType, format);
   }
 }
paddle/math/Vector.cpp  (view file @ 93006787)

@@ -800,6 +800,7 @@ void CpuGpuVectorT<T>::resizeOrCreate(size_t size, bool useGpu) {
   } else if ((!useGpu) && (!cpuVectorT_)) {
     cpuVectorT_ = VectorT<T>::create(size, false);
   } else {
+    CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_));
     this->resize(size, useGpu);
   }
 }
paddle/parameter/Argument.cpp  (view file @ 93006787)

@@ -25,6 +25,7 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu,
   if (!dest) {
     dest = src->clone(0, 0, useGpu);
   } else {
+    CHECK_EQ(dest->useGpu(), useGpu);
     dest->resize(src->getHeight(), src->getWidth());
   }
   dest->copyFrom(*src, stream);

@@ -60,12 +61,12 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src,
                           hl_stream_t stream = HPPL_STREAM_DEFAULT) {
   if (src) {
     CHECK_LE((size_t)startRow + copySize, src->getHeight());
     int height = copySize;
     int width = src->getWidth();
     if (!dest) {
       dest = src->clone(height, width, useGpu);
     } else {
+      CHECK_EQ(dest->useGpu(), useGpu);
       dest->resize(height, width);
     }
     MatrixPtr submat = src->subMatrix(startRow, copySize);

@@ -182,6 +183,11 @@ static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src,
   }
 }

+void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) {
+  resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+}
+
 void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
                                  hl_stream_t stream) {
   dataId = src.dataId;

@@ -199,6 +205,14 @@ void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
   resizeAndCopy(strs, src.strs, useGpu, stream);
 }

+int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,
+                                    int32_t copySize, bool useGpu) {
+  int32_t size =
+      resizeAndCopyFrom(src, startSeq, copySize, useGpu, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  return size;
+}
+
 int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,
                                     int32_t copySize, bool useGpu,
                                     hl_stream_t stream) {
paddle/parameter/Argument.h  (view file @ 93006787)

@@ -203,13 +203,28 @@ struct Argument {
   * startSeq: the sample id of start
   * copySize: how many samples need to copy
   * return value: how many samples are copied
+  * Note that when specifying the stream explicitly in this case,
+  * synchronize should also be called somewhere after this function
   */
-  int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
-                            int32_t copySize, bool useGpu = FLAGS_use_gpu,
-                            hl_stream_t stream = HPPL_STREAM_DEFAULT);
-  void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu,
-                         hl_stream_t stream = HPPL_STREAM_DEFAULT);
+  int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
+                            int32_t copySize, bool useGpu, hl_stream_t stream);
+
+  /*
+   * same with the above function, except that the stream is
+   * HPPL_STREAM_DEFAULT and synchronize is automatically called
+   * inside it
+   */
+  int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
+                            int32_t copySize, bool useGpu = FLAGS_use_gpu);
+
+  void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);
+
+  /*
+   * same with the above function, except that the stream is
+   * HPPL_STREAM_DEFAULT and synchronize is automatically called
+   * inside it
+   */
+  void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);

  /*
   @brief Concatenate several arguments into one and put the result into it.

@@ -240,6 +255,15 @@ struct Argument {
   /*
     Get Sequence Length, startPositions and max Length according to input
+    1. For sequence data:
+      Each tuple is (seq_length, seq_start, seq_id, seq_id)
+      The tuples are sorted according to seq_length or subseq_length
+      *maxSequenceLength is the maximal sequence length
+
+    2. For subsequence data:
+      Each tuple is (subseq_length, subseq_start, seq_id, subseq_id)
+      The tuples are not sorted. They are in the original order.
+      *maxSequenceLenth is the maximal number of subsequences in each sequence.
   */
   void getSeqLengthAndStart(
       std::vector<std::tuple<int, int, int, int>>* seqLengthAndStart,
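To make the tuple layout documented for getSeqLengthAndStart concrete, here is a small illustration (plain Python, not PaddlePaddle code) for a hypothetical batch whose sequenceStartPositions are [0, 3, 5, 9]. Sorting longest-first is an assumption of the sketch; the header only says the tuples are sorted by length:

# Illustration of (seq_length, seq_start, seq_id, seq_id) tuples for a
# hypothetical batch of 9 samples split into 3 sequences.
starts = [0, 3, 5, 9]

tuples = []
for seq_id in range(len(starts) - 1):
    length = starts[seq_id + 1] - starts[seq_id]
    # sequence data: (seq_length, seq_start, seq_id, seq_id)
    tuples.append((length, starts[seq_id], seq_id, seq_id))

# the header says tuples are sorted by length; longest-first assumed here
tuples.sort(reverse=True)
print(tuples)                        # -> [(4, 5, 2, 2), (3, 0, 0, 0), (2, 3, 1, 1)]
max_sequence_length = tuples[0][0]   # -> 4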
proto/ModelConfig.proto.m4  (view file @ 93006787)

@@ -452,6 +452,9 @@ message SubModelConfig {
   repeated LinkConfig out_links = 10;
   optional GeneratorConfig generator = 11;
+
+  // the id of inlink which share info with outlinks, used in recurrent layer group
+  optional int32 target_inlinkid = 12;
 }

 message ModelConfig {
python/paddle/trainer/config_parser.py  (view file @ 93006787)

@@ -303,7 +303,8 @@ def MakeLayerNameInSubmodel(name, submodel_name = None):
 @config_func
 def RecurrentLayerGroupWithoutOutLinksBegin(name,
                                             in_links,
-                                            seq_reversed=False):
+                                            seq_reversed=False,
+                                            target_inlinkname=""):
     global g_current_submodel
     config_assert(g_config.model_config.type == "recurrent_nn",
                   "RecurrentLayerGroup should be used only in recurrent_nn")

@@ -311,14 +312,19 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name,
     SubModelBegin(name)
     g_current_submodel.is_recurrent_layer_group = True
     g_current_submodel.reversed = seq_reversed
+    g_current_submodel.target_inlinkid = -1
     in_links_count = 0
-    for link in in_links:
+    for linkid, link in enumerate(in_links):
         if isinstance(link, basestring):
             name = link
             has_subseq = False
         else:
             name = link.link_name
             has_subseq = link.has_subseq
+        # assign target_inlinkid according to target_inlinkname
+        if target_inlinkname == name:
+            g_current_submodel.target_inlinkid = linkid
         if in_links_count == 0:
             in_links_has_subseq = has_subseq
         else:

@@ -331,6 +337,7 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name,
             SequenceScatterAgentLayer(name=name, size=layer.size)
         else:
             ScatterAgentLayer(name=name, size=layer.size)

         pair = g_current_submodel.in_links.add()
         pair.layer_name = layer_name
         pair.link_name = MakeLayerNameInSubmodel(name)

@@ -362,10 +369,12 @@ def RecurrentLayerGroupBegin(name,
                              in_links,
                              out_links,
                              generator=None,
+                             target_inlinkname="",
                              seq_reversed=False):
     RecurrentLayerGroupWithoutOutLinksBegin(name, in_links,
-                                            seq_reversed)
+                                            seq_reversed, target_inlinkname)
     for link in out_links:
         RecurrentLayerGroupSetOutLink(link)
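The target_inlinkname handling added above amounts to a linear scan over in_links, recording the index of the inlink whose name matches. A standalone sketch of that resolution (hypothetical helper name; string-only in_links for brevity, whereas config_parser.py also accepts LinkConfig-like objects):

# Standalone sketch of the target_inlinkid resolution shown in the diff above.
def resolve_target_inlinkid(in_links, target_inlinkname=""):
    target_inlinkid = -1                     # -1 means no target inlink chosen
    for linkid, link in enumerate(in_links):
        name = link                          # the basestring case in the diff
        if target_inlinkname == name:
            target_inlinkid = linkid
    return target_inlinkid

print(resolve_target_inlinkid(["emb", "label"], "label"))  # -> 1
print(resolve_target_inlinkid(["emb", "label"]))           # -> -1 (default)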