Correctly handle memory in RecurrentGradientMachine for hierarchical RNN

Change-Id: I8e0a8ea6fc2760652d9c76440a539c90860062d3

Correctly handle memory in RecurrentGradientMachine for hierarchical RNN
Change-Id: I8e0a8ea6fc2760652d9c76440a539c90860062d3
9a9de924 · xuwei06 · Yu Yang · 699d5f26 · 9a9de924 · 9a9de924
9 changed files
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -519,7 +519,6 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
          dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
      gatherAgent->addRealLayer(outFrameLine.frames[i]);
    }
    // connect memory links
    // Adopt info_[0].idIndex because seq which has_subseq=True
    // doesn't support Memory with !hasSubseq bootlayer;
@@ -529,7 +528,7 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
      NeuralNetwork::connect(
          memoryFrameLine.agents[i],
          i == 0 ? memoryFrameLine.bootLayer : memoryFrameLine.frames[i - 1],
-          idSize /*height of agent*/);
+          numSeqs_[i] /*height of agent*/);
    }
  }
@@ -622,6 +621,8 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinks_id,
  // numSequences: # samples(sequences) in a batch
  size_t numSequences = input.getNumSequences();
  std::vector<int> allIds;
+  numSeqs_.clear();
  Info* inlink_info = &info_[inlinks_id];
  inlink_info->idIndex.clear();
  inlink_info->idIndex.push_back(0);  // first idIndex = 0
@@ -634,10 +635,12 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinks_id,
    // maxSequenceLength_: max number of sentences(subseq) in allsamples
    for (int i = 0; i < maxSequenceLength_; ++i) {
      sequenceStartPositions.push_back(0);            // first element = 0
+      int numSeqs = 0;
      for (size_t j = 0; j < numSubSequences; ++j) {  // for each sentence
        // seqLengthAndStart_[inlinks_id][j]:
        // a 4-tuple including <subseqlen, subseqstart, seqid, subseqid>
        if (std::get<3>(seqLengthAndStart_[inlinks_id][j]) == i) {
+          ++numSeqs;
          // subseqstart: the cpuSubSequenceStartPositions of this subseq
          int subSeqStart = std::get<1>(seqLengthAndStart_[inlinks_id][j]);
          int subSeqLength = std::get<0>(seqLengthAndStart_[inlinks_id][j]);
@@ -650,6 +653,7 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinks_id,
      }
      inlink_info->idIndex.push_back(allIds.size());
      inlink_info->seqStartPosIndex.push_back(sequenceStartPositions.size());
+      numSeqs_.push_back(numSeqs);
    }
    // inFrameLine create sequenceStartPositions one time
    CHECK_EQ(sequenceStartPositions.size(),
@@ -659,16 +663,19 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinks_id,
    createSeqPos(sequenceStartPositions, &inlink_info->sequenceStartPositions);
  } else {  // for scatterAgentLayer
    for (int i = 0; i < maxSequenceLength_; ++i) {
+      int numSeqs = 0;
      for (size_t j = 0; j < numSequences; ++j) {
        int seqLength = std::get<0>(seqLengthAndStart_[inlinks_id][j]);
        if (i >= seqLength) {
          break;
        }
+        ++numSeqs;
        int seqStart = std::get<1>(seqLengthAndStart_[inlinks_id][j]);
        allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
                                   : (seqStart + i));
      }
      inlink_info->idIndex.push_back(allIds.size());
+      numSeqs_.push_back(numSeqs);
    }
  }

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -333,6 +333,10 @@ protected:
  };
  std::vector<Info> info_;
+  // numSeqs_[i] is the number of sequences which are longer than i (for
+  // sequence data) or have more than i subsequences (for subsequence data)
+  std::vector<int> numSeqs_;
  // each inlinks has a "std::vector<std::tuple<int, int, int, int>>" denotes
  // its sequence info:
  //  if hasSubSeq, tuple of (subSeqLength, subSeqStart, seqIndex, subSeqIndex)

--- a/paddle/gserver/tests/Sequence/dummy.list
+++ b/paddle/gserver/tests/Sequence/dummy.list
+dummy_file_no_use
--- a/paddle/gserver/tests/rnn_data_provider.py
+++ b/paddle/gserver/tests/rnn_data_provider.py
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer.PyDataProvider2 import *
+data = [
+    [[[1, 3, 2], [4, 5, 2]], 0],
+    [[[0, 2], [2, 5], [0, 1, 2]], 1],
+]
+@provider(input_types=[integer_value_sub_sequence(10),
+                       integer_value(2)])
+def process_subseq(settings, file_name):
+    for d in data:
+        yield d
+@provider(input_types=[integer_value_sequence(10),
+                       integer_value(2)])
+def process_seq(settings, file_name):
+    for d in data:
+        seq = []
+        for subseq in d[0]:
+            seq += subseq
+        yield seq, d[1]
--- a/paddle/gserver/tests/sequenceGen.py
+++ b/paddle/gserver/tests/sequenceGen.py
-#!/usr/bin/env python
-#coding=utf-8
 # Copyright (c) 2016 Baidu, Inc. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

--- a/paddle/gserver/tests/sequence_nest_rnn.conf
+++ b/paddle/gserver/tests/sequence_nest_rnn.conf
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+######################## data source ################################
+define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
+                        test_list=None,
+                        module='rnn_data_provider',
+                        obj='process_subseq')
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 8
+hidden_dim = 8
+label_dim = 3
+data = data_layer(name="word", size=dict_dim)
+emb = embedding_layer(input=data, size=word_dim)
+# This hierarchical RNN is designed to be equivalent to the simple RNN in
+# sequence_rnn.conf
+def outer_step(x):
+    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+    def inner_step(y):
+        inner_mem = memory(name="inner_rnn_state",
+                           size=hidden_dim,
+                           boot_layer=outer_mem)
+        return fc_layer(input=[y, inner_mem],
+                        size=hidden_dim,
+                        act=TanhActivation(),
+                        bias_attr=True,
+                        name="inner_rnn_state")
+    inner_rnn_output = recurrent_group(
+        step=inner_step,
+        input=x)
+    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
+    # "return last" should also work. But currently RecurrentGradientMachine
+    # does not handle it correctly. Current implementation requires that
+    # all the out links are from sequences. However, it does not report error
+    # when the out links are not sequences.
+    return inner_rnn_output
+out = recurrent_group(
+    step=outer_step,
+    input=SubsequenceInput(emb))
+value_printer_evaluator(input=out)
+rep = last_seq(input=out)
+prob = fc_layer(size=label_dim,
+                input=rep,
+                act=SoftmaxActivation(),
+                bias_attr=True)
+outputs(classification_cost(input=prob,
+                            label=data_layer(name="label", size=label_dim)))
--- a/paddle/gserver/tests/sequence_rnn.conf
+++ b/paddle/gserver/tests/sequence_rnn.conf
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+######################## data source ################################
+define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
+                        test_list=None,
+                        module='rnn_data_provider',
+                        obj='process_seq')
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 8
+hidden_dim = 8
+label_dim = 3
+data = data_layer(name="word", size=dict_dim)
+emb = embedding_layer(input=data, size=word_dim)
+def step(y):
+    mem = memory(name="rnn_state", size=hidden_dim)
+    return fc_layer(input=[y, mem],
+                    size=hidden_dim,
+                    act=TanhActivation(),
+                    bias_attr=True,
+                    name="rnn_state")
+out = recurrent_group(
+    step=step,
+    input=emb)
+value_printer_evaluator(input=out)
+rep = last_seq(input=out)
+prob = fc_layer(size=label_dim,
+                input=rep,
+                act=SoftmaxActivation(),
+                bias_attr=True)
+outputs(classification_cost(input=prob,
+                            label=data_layer(name="label", size=label_dim)))
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -21,6 +21,8 @@ limitations under the License. */
 #include <paddle/trainer/TrainerInternal.h>
 #include <paddle/gserver/gradientmachines/GradientMachine.h>
+P_DECLARE_int32(seed);
 using namespace paddle;  // NOLINT
 using namespace std;  // NOLINT
 class TrainerForTest : public paddle::Trainer {
@@ -68,7 +70,9 @@ void CalCost(const string& conf, const string& dir, real* cost,
  CpuVector vecMomentum(dim);
  // vecW needs to be assigned, otherwise the variable is an uncertain value.
-  vecW.zeroMem();
+  *ThreadLocalRand::getSeed() = FLAGS_seed;
+  vecW.randnorm(0, 0.1);
  trainer.startTrain();
  for (int i = 0; i < num_passes; ++i) {
@@ -88,15 +92,13 @@ void CalCost(const string& conf, const string& dir, real* cost,
  rmDir(dir.c_str());
 }
-TEST(RecurrentGradientMachine, HasSubSequence) {
+void test(const string& conf1, const string& conf2) {
  int num_passes = 5;
  real* cost1 = new real[num_passes];
-  const string conf1 = "gserver/tests/sequence_layer_group.conf";
  const string dir1 = "gserver/tests/t1";
  CalCost(conf1, dir1, cost1, num_passes);
  real* cost2 = new real[num_passes];
-  const string conf2 = "gserver/tests/sequence_nest_layer_group.conf";
  const string dir2 = "gserver/tests/t2";
  CalCost(conf2, dir2, cost2, num_passes);
@@ -109,6 +111,17 @@ TEST(RecurrentGradientMachine, HasSubSequence) {
  delete[] cost2;
 }
+TEST(RecurrentGradientMachine, HasSubSequence) {
+  test("gserver/tests/sequence_layer_group.conf",
+       "gserver/tests/sequence_nest_layer_group.conf");
+}
+TEST(RecurrentGradientMachine, rnn) {
+  test("gserver/tests/sequence_rnn.conf",
+       "gserver/tests/sequence_nest_rnn.conf");
+}
 int main(int argc, char** argv) {
  if (paddle::version::isWithPyDataProvider()) {
    if (!paddle::version::isWithGpu()) {

--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -255,6 +255,15 @@ struct Argument {
  /*
   Get Sequence Length, startPositions and max Length according to input
+   1. For sequence data:
+      Each tuple is (seq_length, seq_start, seq_id, seq_id)
+      The tuples are sorted according to seq_length or subseq_length
+      *maxSequenceLength is the maximal sequence length
+   2. For subsequence data:
+      Each tuple is (subseq_length, subseq_start, seq_id, subseq_id)
+      The tuples are not sorted. They are in the original order.
+      *maxSequenceLength is the maximal number of subsequences in each sequence.
   */
  void getSeqLengthAndStart(
      std::vector<std::tuple<int, int, int, int>>* seqLengthAndStart,