diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index e000bb2e5d6e96a92711ff9e244807132dcd88d5..96b0e19880b68fa2fb0ac5ad55af6d00ed401224 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -519,7 +519,6 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
         dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
     gatherAgent->addRealLayer(outFrameLine.frames[i]);
   }
-
   // connect memory links
   // Adopt info_[0].idIndex because seq which has_subseq=True
   // doesn't support Memory with !hasSubseq bootlayer;
@@ -529,7 +528,7 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
       NeuralNetwork::connect(
           memoryFrameLine.agents[i],
           i == 0 ? memoryFrameLine.bootLayer : memoryFrameLine.frames[i - 1],
-          idSize /*height of agent*/);
+          numSeqs_[i] /*height of agent*/);
     }
   }
 
@@ -622,6 +621,8 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinks_id,
   // numSequences: # samples(sequences) in a batch
   size_t numSequences = input.getNumSequences();
   std::vector<int> allIds;
+
+  numSeqs_.clear();
   Info* inlink_info = &info_[inlinks_id];
   inlink_info->idIndex.clear();
   inlink_info->idIndex.push_back(0);  // first idIndex = 0
@@ -634,10 +635,12 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinks_id,
     // maxSequenceLength_: max number of sentences(subseq) in all samples
     for (int i = 0; i < maxSequenceLength_; ++i) {
       sequenceStartPositions.push_back(0);  // first element = 0
+      int numSeqs = 0;
       for (size_t j = 0; j < numSubSequences; ++j) {  // for each sentence
         // seqLengthAndStart_[inlinks_id][j]:
         // a 4-tuple including <subseqLength, subseqStart, seqIndex, subseqIndex>
         if (std::get<3>(seqLengthAndStart_[inlinks_id][j]) == i) {
+          ++numSeqs;
           // subseqstart: the cpuSubSequenceStartPositions of this subseq
           int subSeqStart = std::get<1>(seqLengthAndStart_[inlinks_id][j]);
           int subSeqLength = std::get<0>(seqLengthAndStart_[inlinks_id][j]);
@@ -650,6 +653,7 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinks_id,
       }
       inlink_info->idIndex.push_back(allIds.size());
       inlink_info->seqStartPosIndex.push_back(sequenceStartPositions.size());
+      numSeqs_.push_back(numSeqs);
     }
     // inFrameLine create sequenceStartPositions one time
     CHECK_EQ(sequenceStartPositions.size(),
@@ -659,16 +663,19 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinks_id,
     createSeqPos(sequenceStartPositions, &inlink_info->sequenceStartPositions);
   } else {  // for scatterAgentLayer
     for (int i = 0; i < maxSequenceLength_; ++i) {
+      int numSeqs = 0;
       for (size_t j = 0; j < numSequences; ++j) {
         int seqLength = std::get<0>(seqLengthAndStart_[inlinks_id][j]);
         if (i >= seqLength) {
           break;
         }
+        ++numSeqs;
         int seqStart = std::get<1>(seqLengthAndStart_[inlinks_id][j]);
         allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
                                    : (seqStart + i));
       }
       inlink_info->idIndex.push_back(allIds.size());
+      numSeqs_.push_back(numSeqs);
     }
   }
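Note on the new bookkeeping: numSeqs_[i] ends up holding how many sequences (or subsequences, in the nested case) are still active at recurrent step i, which is the height the agent layer needs at that step. A minimal Python sketch of the scatter-agent branch above (a hypothetical standalone helper, not part of the patch; it assumes lengths sorted in decreasing order, which is what the early break in the loop relies on):

    # seq_lengths mirrors std::get<0>(seqLengthAndStart_[inlinks_id][j]).
    def num_seqs_per_step(seq_lengths, max_sequence_length):
        num_seqs = []
        for i in range(max_sequence_length):
            active = 0
            for length in sorted(seq_lengths, reverse=True):
                if i >= length:
                    break  # remaining sequences are shorter, so none are active
                active += 1
            num_seqs.append(active)
        return num_seqs

    # For the two samples in rnn_data_provider.py (flat lengths 7 and 6):
    assert num_seqs_per_step([7, 6], 7) == [2, 2, 2, 2, 2, 2, 1]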
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
index 4ca545b504f7374f8effd828153c6d4ea05c9ea3..d9901ad81d60ae7c30ac89e3fc34ef6c179b5dd5 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -333,6 +333,10 @@ protected:
   };
   std::vector<Info> info_;
 
+  // numSeqs_[i] is the number of sequences that are longer than i (for
+  // sequence data) or have more than i subsequences (for subsequence data)
+  std::vector<int> numSeqs_;
+
   // each inlinks has a "std::vector<std::tuple<int, int, int, int>>" that
   // denotes its sequence info:
   // if hasSubSeq, tuple of (subSeqLength, subSeqStart, seqIndex, subSeqIndex)
diff --git a/paddle/gserver/tests/Sequence/dummy.list b/paddle/gserver/tests/Sequence/dummy.list
new file mode 100644
index 0000000000000000000000000000000000000000..0e52665e11298965df5738f69c5bcefcc8bab0f9
--- /dev/null
+++ b/paddle/gserver/tests/Sequence/dummy.list
@@ -0,0 +1 @@
+dummy_file_no_use
diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/gserver/tests/rnn_data_provider.py
new file mode 100644
index 0000000000000000000000000000000000000000..85a83554c5c3045d144ee0250d2808237eccc9e0
--- /dev/null
+++ b/paddle/gserver/tests/rnn_data_provider.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer.PyDataProvider2 import *
+
+data = [
+    [[[1, 3, 2], [4, 5, 2]], 0],
+    [[[0, 2], [2, 5], [0, 1, 2]], 1],
+]
+
+@provider(input_types=[integer_value_sub_sequence(10),
+                       integer_value(2)])
+def process_subseq(settings, file_name):
+    for d in data:
+        yield d
+
+@provider(input_types=[integer_value_sequence(10),
+                       integer_value(2)])
+def process_seq(settings, file_name):
+    for d in data:
+        seq = []
+        for subseq in d[0]:
+            seq += subseq
+        yield seq, d[1]
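For clarity, process_seq yields the same tokens as process_subseq with the subsequence boundaries erased; a quick standalone check (not part of the patch), mirroring the flattening loop above:

    data = [
        [[[1, 3, 2], [4, 5, 2]], 0],
        [[[0, 2], [2, 5], [0, 1, 2]], 1],
    ]

    for d in data:
        flat = []
        for subseq in d[0]:
            flat += subseq
        print(flat, d[1])
    # [1, 3, 2, 4, 5, 2] 0
    # [0, 2, 2, 5, 0, 1, 2] 1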
diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/gserver/tests/sequenceGen.py
index e4727e472d446b48e6001968841bfc178e34ec0c..cb83d79d78cc677d5ffeb77f5693d08da2a51668 100644
--- a/paddle/gserver/tests/sequenceGen.py
+++ b/paddle/gserver/tests/sequenceGen.py
@@ -1,6 +1,3 @@
-#!/usr/bin/env python
-#coding=utf-8
-
 # Copyright (c) 2016 Baidu, Inc. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddle/gserver/tests/sequence_nest_rnn.conf b/paddle/gserver/tests/sequence_nest_rnn.conf
new file mode 100644
index 0000000000000000000000000000000000000000..03eef7a2175880529b2743adc3ca2aaf3622c095
--- /dev/null
+++ b/paddle/gserver/tests/sequence_nest_rnn.conf
@@ -0,0 +1,75 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
+                        test_list=None,
+                        module='rnn_data_provider',
+                        obj='process_subseq')
+
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configuration ################################
+dict_dim = 10
+word_dim = 8
+hidden_dim = 8
+label_dim = 3
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(input=data, size=word_dim)
+
+# This hierarchical RNN is designed to be equivalent to the simple RNN in
+# sequence_rnn.conf
+
+def outer_step(x):
+    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+
+    def inner_step(y):
+        inner_mem = memory(name="inner_rnn_state",
+                           size=hidden_dim,
+                           boot_layer=outer_mem)
+        return fc_layer(input=[y, inner_mem],
+                        size=hidden_dim,
+                        act=TanhActivation(),
+                        bias_attr=True,
+                        name="inner_rnn_state")
+
+    inner_rnn_output = recurrent_group(
+        step=inner_step,
+        input=x)
+    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
+
+    # "return last" should also work. But currently RecurrentGradientMachine
+    # does not handle it correctly. The current implementation requires that
+    # all the outlinks are from sequences. However, it does not report an
+    # error when the outlinks are not sequences.
+    return inner_rnn_output
+
+out = recurrent_group(
+    step=outer_step,
+    input=SubsequenceInput(emb))
+
+value_printer_evaluator(input=out)
+
+rep = last_seq(input=out)
+prob = fc_layer(size=label_dim,
+                input=rep,
+                act=SoftmaxActivation(),
+                bias_attr=True)
+
+outputs(classification_cost(input=prob,
+                            label=data_layer(name="label", size=label_dim)))
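Why the two configs are equivalent: the inner recurrence is booted with the outer memory, which holds the last hidden state of the previous subsequence, so chaining the inner RNNs across subsequences reproduces one flat RNN over the concatenated tokens. A toy Python sketch of that argument (the placeholder step function stands in for the fc_layer + tanh update; this is not Paddle code):

    def step(token, state):
        return state + token  # placeholder recurrence

    def flat_rnn(tokens, boot=0):
        state = boot
        for t in tokens:
            state = step(t, state)
        return state

    def nested_rnn(subseqs, boot=0):
        outer_state = boot
        for subseq in subseqs:
            # inner memory booted with the outer memory, mirroring
            # boot_layer=outer_mem in inner_step above
            outer_state = flat_rnn(subseq, boot=outer_state)
        return outer_state

    assert nested_rnn([[1, 3, 2], [4, 5, 2]]) == flat_rnn([1, 3, 2, 4, 5, 2])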
diff --git a/paddle/gserver/tests/sequence_rnn.conf b/paddle/gserver/tests/sequence_rnn.conf
new file mode 100644
index 0000000000000000000000000000000000000000..73e7a5935f6c3cf1857d2b952471bb969d86211a
--- /dev/null
+++ b/paddle/gserver/tests/sequence_rnn.conf
@@ -0,0 +1,57 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
+                        test_list=None,
+                        module='rnn_data_provider',
+                        obj='process_seq')
+
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configuration ################################
+dict_dim = 10
+word_dim = 8
+hidden_dim = 8
+label_dim = 3
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(input=data, size=word_dim)
+
+def step(y):
+    mem = memory(name="rnn_state", size=hidden_dim)
+    return fc_layer(input=[y, mem],
+                    size=hidden_dim,
+                    act=TanhActivation(),
+                    bias_attr=True,
+                    name="rnn_state")
+
+out = recurrent_group(
+    step=step,
+    input=emb)
+
+value_printer_evaluator(input=out)
+
+rep = last_seq(input=out)
+prob = fc_layer(size=label_dim,
+                input=rep,
+                act=SoftmaxActivation(),
+                bias_attr=True)
+
+outputs(classification_cost(input=prob,
+                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
index 35d6ee7f4a402d198dbcd1df7b272dcd65723659..f6989e9a6463a286f5df43e951b2d79e74c3d16a 100644
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -21,6 +21,8 @@ limitations under the License. */
 #include
 #include
 
+P_DECLARE_int32(seed);
+
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 class TrainerForTest : public paddle::Trainer {
@@ -68,7 +70,9 @@ void CalCost(const string& conf, const string& dir, real* cost,
   CpuVector vecMomentum(dim);
 
   // vecW needs to be assigned, otherwise the variable is an uncertain value.
-  vecW.zeroMem();
+
+  *ThreadLocalRand::getSeed() = FLAGS_seed;
+  vecW.randnorm(0, 0.1);
 
   trainer.startTrain();
   for (int i = 0; i < num_passes; ++i) {
@@ -88,15 +92,13 @@ void CalCost(const string& conf, const string& dir, real* cost,
   rmDir(dir.c_str());
 }
 
-TEST(RecurrentGradientMachine, HasSubSequence) {
+void test(const string& conf1, const string& conf2) {
   int num_passes = 5;
   real* cost1 = new real[num_passes];
-  const string conf1 = "gserver/tests/sequence_layer_group.conf";
   const string dir1 = "gserver/tests/t1";
   CalCost(conf1, dir1, cost1, num_passes);
 
   real* cost2 = new real[num_passes];
-  const string conf2 = "gserver/tests/sequence_nest_layer_group.conf";
   const string dir2 = "gserver/tests/t2";
   CalCost(conf2, dir2, cost2, num_passes);
 
@@ -109,6 +111,17 @@ void CalCost(const string& conf, const string& dir, real* cost,
   delete[] cost2;
 }
 
+TEST(RecurrentGradientMachine, HasSubSequence) {
+  test("gserver/tests/sequence_layer_group.conf",
+       "gserver/tests/sequence_nest_layer_group.conf");
+}
+
+TEST(RecurrentGradientMachine, rnn) {
+  test("gserver/tests/sequence_rnn.conf",
+       "gserver/tests/sequence_nest_rnn.conf");
+}
+
 int main(int argc, char** argv) {
   if (paddle::version::isWithPyDataProvider()) {
     if (!paddle::version::isWithGpu()) {
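The refactored test seeds the weights identically for both configs, trains each for num_passes passes, and compares the per-pass costs. Schematically (a Python sketch of the comparison step only; the tolerance is illustrative, and the actual check sits in the C++ between the two CalCost calls and the deletes, elided from the hunk above):

    def compare_costs(cost1, cost2, rel_tol=1e-3):
        # flat vs. nested configs should track each other pass by pass
        for pass_id, (c1, c2) in enumerate(zip(cost1, cost2)):
            diff = abs(c1 - c2)
            assert diff <= rel_tol * max(abs(c1), abs(c2)), (pass_id, c1, c2)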
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index 3cab87c700225db0c3e54e60aa8376db14bc2fb1..5474a05b84101ab46e0d1826626dd59399a328a9 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -255,6 +255,15 @@ struct Argument {
   /*
     Get Sequence Length, startPositions and max Length according to input
+    1. For sequence data:
+      Each tuple is (seq_length, seq_start, seq_id, seq_id)
+      The tuples are sorted according to seq_length or subseq_length
+      *maxSequenceLength is the maximal sequence length
+
+    2. For subsequence data:
+      Each tuple is (subseq_length, subseq_start, seq_id, subseq_id)
+      The tuples are not sorted. They are in the original order.
+      *maxSequenceLength is the maximal number of subsequences in each sequence.
   */
   void getSeqLengthAndStart(
       std::vector<std::tuple<int, int, int, int>>* seqLengthAndStart,
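To make the comment concrete, these are the tuples the two views would describe for the two-sample batch in rnn_data_provider.py (illustrative values, assuming the flat sequences of lengths 6 and 7 are laid out back to back in the batch):

    # Sequence data: (seq_length, seq_start, seq_id, seq_id),
    # sorted by decreasing length; sample 1 (7 tokens) starts at offset 6.
    seq_tuples = [(7, 6, 1, 1), (6, 0, 0, 0)]
    seq_max_len = 7  # maximal sequence length

    # Subsequence data: (subseq_length, subseq_start, seq_id, subseq_id),
    # kept in the original order.
    subseq_tuples = [(3, 0, 0, 0), (3, 3, 0, 1),
                     (2, 6, 1, 0), (2, 8, 1, 1), (3, 10, 1, 2)]
    subseq_max_len = 3  # maximal number of subsequences per sequence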