Further fix the memory for Hierarchical RNN

Sequences should be sorted according to the number of subsequences they have.

Further fix the memory for Hierarchical RNN
Sequences should be sorted according to the number of subsequences they have.
05a97ab5 · xuwei06 · a9d327bd · 05a97ab5 · 05a97ab5 · 05a97ab5
12 changed files
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "hl_matrix_apply.cuh"
 #include "hl_sequence.h"
 #include "paddle/utils/Logging.h"
+#include "hl_device_functions.cuh"
 DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
 DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b);

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -434,23 +434,25 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
    }
  }
-  seqLengthAndStart_.clear();
  info_.clear();
  info_.resize(inFrameLines_.size());
-  seqLengthAndStart_.resize(inFrameLines_.size());
+  seqInfos_.clear();
+  seqInfos_.resize(inFrameLines_.size());
  {
    AsyncGpuBlock asyncGpuBlock;
    // if shareInlinkInfo, only calculate info of the first inlink
    // else, calculate info for each inlink
    if (shareInlinkInfo) {
-      input.getSeqLengthAndStart(&seqLengthAndStart_[0], &maxSequenceLength_);
+      input.getSeqInfo(&seqInfos_[0]);
+      maxSequenceLength_ = seqInfos_[0][0].topLevelLength;
      createInFrameInfo(0, input, passType);
    } else {
      for (size_t i = 0; i < inFrameLines_.size(); i++) {
        const Argument& input1 = inFrameLines_[i].inLayer->getOutput();
-        input1.getSeqLengthAndStart(&seqLengthAndStart_[i],
+        input1.getSeqInfo(&seqInfos_[i]);
-                                    &maxSequenceLength_);
+        maxSequenceLength_ = seqInfos_[i][0].topLevelLength;
        createInFrameInfo(i, input1, passType);
      }
    }
@@ -614,7 +616,7 @@ void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() {
 * for all realLayer of inFrameLines one time.
 */
-void RecurrentGradientMachine::createInFrameInfo(int inlinks_id,
+void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
                                                 const Argument& input,
                                                 PassType passType) {
  bool hasSubseq = input.hasSubseq();
@@ -622,66 +624,67 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinks_id,
  size_t numSequences = input.getNumSequences();
  std::vector<int> allIds;
+  auto& seqInfo = seqInfos_[inlinkId];
  numSeqs_.clear();
-  Info* inlink_info = &info_[inlinks_id];
+  Info* inlinkInfo = &info_[inlinkId];
-  inlink_info->idIndex.clear();
+  inlinkInfo->idIndex.clear();
-  inlink_info->idIndex.push_back(0);  // first idIndex = 0
+  inlinkInfo->idIndex.push_back(0);  // first idIndex = 0
+  std::vector<int> sequenceStartPositions;
+  const int* subSequenceStartPositions = nullptr;
  if (hasSubseq) {                    // for sequenceScatterAgentLayer
-    // numSubSequences : all sentences within all samples(batch)
+    subSequenceStartPositions =
-    size_t numSubSequences = input.getNumSubSequences();
+        input.subSequenceStartPositions->getData(false);
-    std::vector<int> sequenceStartPositions;
+    inlinkInfo->seqStartPosIndex.clear();
-    inlink_info->seqStartPosIndex.clear();
+    inlinkInfo->seqStartPosIndex.push_back(0);  // first seqStartPosIndex = 0
-    inlink_info->seqStartPosIndex.push_back(0);  // first seqStartPosIndex = 0
+  }
-    // maxSequenceLength_: max number of sentences(subseq) in allsamples
+  // maxSequenceLength_: max topLevelLength in all samples
-    for (int i = 0; i < maxSequenceLength_; ++i) {
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    if (hasSubseq) {
      sequenceStartPositions.push_back(0);            // first element = 0
-      int numSeqs = 0;
-      for (size_t j = 0; j < numSubSequences; ++j) {  // for each sentence
-        // seqLengthAndStart_[inlinks_id][j]:
-        // a 4-tuple including <subseqlen, subseqstart, seqid, subseqid>
-        if (std::get<3>(seqLengthAndStart_[inlinks_id][j]) == i) {
-          ++numSeqs;
-          // subseqstart: the cpuSubSequenceStartPositions of this subseq
-          int subSeqStart = std::get<1>(seqLengthAndStart_[inlinks_id][j]);
-          int subSeqLength = std::get<0>(seqLengthAndStart_[inlinks_id][j]);
-          for (int k = subSeqStart; k < subSeqStart + subSeqLength; ++k) {
-            allIds.push_back(k);
-          }
-          sequenceStartPositions.push_back(sequenceStartPositions.back() +
-                                           subSeqLength);
-        }
-      }
-      inlink_info->idIndex.push_back(allIds.size());
-      inlink_info->seqStartPosIndex.push_back(sequenceStartPositions.size());
-      numSeqs_.push_back(numSeqs);
    }
-    // inFrameLine create sequenceStartPositions one time
+    int numSeqs = 0;
-    CHECK_EQ(sequenceStartPositions.size(),
+    for (size_t j = 0; j < numSequences; ++j) {
-             maxSequenceLength_ + numSubSequences);
+      int seqLength = seqInfo[j].topLevelLength;
-    CHECK_EQ(inlink_info->seqStartPosIndex.size(),
+      if (i >= seqLength) {
-             static_cast<size_t>(maxSequenceLength_ + 1));
+        break;
-    createSeqPos(sequenceStartPositions, &inlink_info->sequenceStartPositions);
+      }
-  } else {  // for scatterAgentLayer
+      ++numSeqs;
-    for (int i = 0; i < maxSequenceLength_; ++i) {
+      if (hasSubseq) {
-      int numSeqs = 0;
+        int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i];
-      for (size_t j = 0; j < numSequences; ++j) {
+        int subSeqEnd =
-        int seqLength = std::get<0>(seqLengthAndStart_[inlinks_id][j]);
+            subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1];
-        if (i >= seqLength) {
+        for (int k = subSeqStart; k < subSeqEnd; ++k) {
-          break;
+          allIds.push_back(k);
        }
-        ++numSeqs;
+        sequenceStartPositions.push_back(sequenceStartPositions.back() +
-        int seqStart = std::get<1>(seqLengthAndStart_[inlinks_id][j]);
+                                         subSeqEnd - subSeqStart);
+      } else {
+        int seqStart = seqInfo[j].seqStart;
        allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
                                   : (seqStart + i));
      }
-      inlink_info->idIndex.push_back(allIds.size());
-      numSeqs_.push_back(numSeqs);
    }
+    inlinkInfo->idIndex.push_back(allIds.size());
+    numSeqs_.push_back(numSeqs);
+    if (hasSubseq) {
+      inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size());
+    }
+  }
+  if (hasSubseq) {
+    // inFrameLine create sequenceStartPositions one time
+    CHECK_EQ(sequenceStartPositions.size(),
+             maxSequenceLength_ + input.getNumSubSequences());
+    CHECK_EQ(inlinkInfo->seqStartPosIndex.size(),
+             static_cast<size_t>(maxSequenceLength_ + 1));
+    createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions);
  }
  // copy and check scatterId
-  copyScattedId(allIds, &inlink_info->allIds, input.getBatchSize());
+  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
-  CHECK_EQ(inlink_info->idIndex.size(),
+  CHECK_EQ(inlinkInfo->idIndex.size(),
           static_cast<size_t>(maxSequenceLength_ + 1));
 }
@@ -701,7 +704,7 @@ void RecurrentGradientMachine::createMemoryFrameInfo(
    const int* starts = input.sequenceStartPositions->getData(false);
    for (size_t i = 0; i < numSequences; ++i) {
      // memory info adopt info of inlinks[0]
-      int seqId = std::get<2>(seqLengthAndStart_[0][i]);
+      int seqId = seqInfos_[0][i].seqId;
      for (int k = starts[seqId]; k < starts[seqId + 1]; ++k) {
        allIds.push_back(k);
      }
@@ -713,7 +716,7 @@ void RecurrentGradientMachine::createMemoryFrameInfo(
  } else {  // for scatterAgentLayer
    for (size_t i = 0; i < numSequences; ++i) {
-      allIds.push_back(std::get<2>(seqLengthAndStart_[0][i]));
+      allIds.push_back(seqInfos_[0][i].seqId);
    }
  }
  // copy and check scatterId

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -337,11 +337,7 @@ protected:
  // data) or has more than i subsequences (for subsequence data)
  std::vector<int> numSeqs_;
-  // each inlinks has a "std::vector<std::tuple<int, int, int, int>>" denotes
+  std::vector<std::vector<Argument::SeqInfo>> seqInfos_;
-  // its sequence info:
-  //  if hasSubSeq, tuple of (subSeqLength, subSeqStart, seqIndex, subSeqIndex)
-  //  else, tuple of (seqLength, seqStart, seqIndex, seqIndex)
-  std::vector<std::vector<std::tuple<int, int, int, int>>> seqLengthAndStart_;
  // the id of inlink which share info with outlinks
  int targetInfoInlinkId_;

--- a/paddle/gserver/layers/PrintLayer.cpp
+++ b/paddle/gserver/layers/PrintLayer.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "Layer.h"
+namespace paddle {
+class PrintLayer : public Layer {
+public:
+  explicit PrintLayer(const LayerConfig& config)
+      : Layer(config) {}
+  void forward(PassType passType);
+  void backward(const UpdateCallback& callback) {}
+};
+void PrintLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    const auto& argu = getInput(i);
+    const std::string& name = inputLayers_[i]->getName();
+    if (argu.value) {
+      std::ostringstream os;
+      argu.value->print(os);
+      LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str();
+    }
+    if (argu.ids) {
+      std::ostringstream os;
+      argu.ids->print(os, argu.ids->getSize());
+      LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str();
+    }
+    if (auto startPos = argu.sequenceStartPositions) {
+      std::ostringstream os;
+      startPos->getVector(false)->print(os, startPos->getSize());
+      LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
+    }
+    if (auto subStartPos = argu.subSequenceStartPositions) {
+      std::ostringstream os;
+      subStartPos->getVector(false)->print(os, subStartPos->getSize());
+      LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n"
+                << os.str();
+    }
+  }
+}
+REGISTER_LAYER(print, PrintLayer);
+}  // namespace paddle
--- a/paddle/gserver/tests/sequence_nest_rnn.conf
+++ b/paddle/gserver/tests/sequence_nest_rnn.conf
@@ -42,14 +42,16 @@ def outer_step(x):
        inner_mem = memory(name="inner_rnn_state",
                           size=hidden_dim,
                           boot_layer=outer_mem)
-        return fc_layer(input=[y, inner_mem],
+        out = fc_layer(input=[y, inner_mem],
                        size=hidden_dim,
                        act=TanhActivation(),
                        bias_attr=True,
                        name="inner_rnn_state")
+        return out
    inner_rnn_output = recurrent_group(
        step=inner_step,
+        name="inner",
        input=x)
    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
@@ -60,11 +62,10 @@ def outer_step(x):
    return inner_rnn_output
 out = recurrent_group(
+    name="outer",
    step=outer_step,
    input=SubsequenceInput(emb))
-value_printer_evaluator(input=out)
 rep = last_seq(input=out)
 prob = fc_layer(size=label_dim,
                input=rep,

--- a/paddle/gserver/tests/sequence_rnn.conf
+++ b/paddle/gserver/tests/sequence_rnn.conf
@@ -35,18 +35,18 @@ emb = embedding_layer(input=data, size=word_dim)
 def step(y):
    mem = memory(name="rnn_state", size=hidden_dim)
-    return fc_layer(input=[y, mem],
+    out = fc_layer(input=[y, mem],
                    size=hidden_dim,
                    act=TanhActivation(),
                    bias_attr=True,
                    name="rnn_state")
+    return out
 out = recurrent_group(
+    name="rnn",
    step=step,
    input=emb)
-value_printer_evaluator(input=out)
 rep = last_seq(input=out)
 prob = fc_layer(size=label_dim,
                input=rep,

--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -92,7 +92,7 @@ void CalCost(const string& conf, const string& dir, real* cost,
  rmDir(dir.c_str());
 }
-void test(const string& conf1, const string& conf2) {
+void test(const string& conf1, const string& conf2, double eps) {
  int num_passes = 5;
  real* cost1 = new real[num_passes];
  const string dir1 = "gserver/tests/t1";
@@ -104,8 +104,9 @@ void test(const string& conf1, const string& conf2) {
  for (int i = 0; i < num_passes; i++) {
    LOG(INFO) << "num_passes: " << i << ", cost1=" << cost1[i]
-              << ", cost2=" << cost2[i];
+              << ", cost2=" << cost2[i]
-    ASSERT_NEAR(cost1[i], cost2[i], 1e-3);
+              << ", diff=" << std::abs(cost1[i] - cost2[i]);
+    ASSERT_NEAR(cost1[i], cost2[i], eps);
  }
  delete[] cost1;
  delete[] cost2;
@@ -113,12 +114,14 @@ void test(const string& conf1, const string& conf2) {
 TEST(RecurrentGradientMachine, HasSubSequence) {
  test("gserver/tests/sequence_layer_group.conf",
-       "gserver/tests/sequence_nest_layer_group.conf");
+       "gserver/tests/sequence_nest_layer_group.conf",
+       1e-5);
 }
 TEST(RecurrentGradientMachine, rnn) {
  test("gserver/tests/sequence_rnn.conf",
-       "gserver/tests/sequence_nest_rnn.conf");
+       "gserver/tests/sequence_nest_rnn.conf",
+       0);
 }

--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -477,51 +477,34 @@ void Argument::splitByDataId(const std::vector<Argument>& argus,
  }
 }
-void Argument::getSeqLengthAndStart(
+void Argument::getSeqInfo(std::vector<SeqInfo>* seqInfo) const {
-    std::vector<std::tuple<int, int, int, int>>* seqLengthAndStart,
-    int* maxSequenceLength) const {
  const int* starts = sequenceStartPositions->getData(false);
-  if (hasSubseq()) {
+  const int* subStarts = hasSubseq()
-    size_t numSubSequences = getNumSubSequences();
+      ? subSequenceStartPositions->getData(false) : nullptr;
-    (*seqLengthAndStart).reserve(numSubSequences);
+  size_t numSequences = getNumSequences();
-    const int* subStarts = subSequenceStartPositions->getData(false);
+  seqInfo->reserve(numSequences);
-    int seqIndex = 0;
+  int subSeqEnd = 0;
-    int subSeqIndex = 0;
+  for (size_t i = 0; i < numSequences; ++i) {
-    *maxSequenceLength = 0;
+    SeqInfo info;
-    for (size_t i = 0; i < numSubSequences; ++i) {
+    info.seqStart = starts[i];
-      if (subStarts[i] == starts[seqIndex]) {
+    info.subLevelLength = starts[i + 1] - starts[i];
-        subSeqIndex = 0;
+    info.seqId = i;
-        (*seqLengthAndStart)
+    if (hasSubseq()) {
-            .push_back(std::make_tuple<int, int, int, int>(
+      info.subSeqStart = subSeqEnd;
-                subStarts[i + 1] - subStarts[i], (int)subStarts[i],
+      while (subStarts[subSeqEnd] < starts[i + 1]) {
-                (int)seqIndex, (int)subSeqIndex));
+        ++subSeqEnd;
-        ++subSeqIndex;
-        ++seqIndex;
-      } else if (subStarts[i] < starts[seqIndex]) {
-        (*seqLengthAndStart)
-            .push_back(std::make_tuple<int, int, int, int>(
-                subStarts[i + 1] - subStarts[i], (int)subStarts[i],
-                (int)seqIndex - 1, (int)subSeqIndex));
-        ++subSeqIndex;
      }
-      // maxSequenceLength_ = 1 + max(subSeqIndex) in each Seq.
+      info.topLevelLength = subSeqEnd - info.subSeqStart;
-      if (*maxSequenceLength < std::get<3>((*seqLengthAndStart)[i]))
+    } else {
-        *maxSequenceLength = std::get<3>((*seqLengthAndStart)[i]);
+      info.topLevelLength = info.subLevelLength;
-    }
+      info.subSeqStart = 0;  // not used
-    *maxSequenceLength += 1;
-  } else {
-    size_t numSequences = getNumSequences();
-    (*seqLengthAndStart).reserve(numSequences);
-    for (size_t i = 0; i < numSequences; ++i) {
-      (*seqLengthAndStart)
-          .push_back(std::make_tuple<int, int, int, int>(
-              starts[i + 1] - starts[i], (int)starts[i], (int)i, (int)i));
    }
-    std::sort((*seqLengthAndStart).begin(), (*seqLengthAndStart).end(),
+    seqInfo->push_back(info);
-              std::greater<std::tuple<int, int, int, int>>());
-    *maxSequenceLength = std::get<0>((*seqLengthAndStart)[0]);
  }
+  std::sort(seqInfo->begin(), seqInfo->end(),
+            [](const SeqInfo& a, const SeqInfo& b) {
+              return a.topLevelLength > b.topLevelLength;
+            });
 }
 void Argument::checkSubset() const {

--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -253,21 +253,29 @@ struct Argument {
  static void splitByDataId(const std::vector<Argument>& argus,
                            std::vector<std::vector<Argument>>* arguGroups);
+  struct SeqInfo {
+    // Equal to sequence length for sequence data
+    // Equal to number of subsequences for subsequence data
+    int topLevelLength;
+    int seqStart;
+    int seqId;
+    // Equal to topLevelLength for sequence data
+    // Equal to sum of the length of subsequences for subsequence data
+    int subLevelLength;
+    // Only used for subsequence data: index of this sequence's start position
+    // in subSequenceStartPositions, i.e.
+    // subSequenceStartPositions[subSeqStart] == seqStart
+    int subSeqStart;
+  };
  /*
-   Get Sequence Length, startPositions and max Length according to input
+    Get SeqInfo for each sequence of this argument
-   1. For sequence data:
+    Elements in *seqInfo are sorted by topLevelLength in descending order
-      Each tuple is (seq_length, seq_start, seq_id, seq_id)
+  */
-      The tuples are sorted according to seq_length or subseq_length
+  void getSeqInfo(std::vector<SeqInfo>* segInfo) const;
-      *maxSequenceLength is the maximal sequence length
-   2. For subsequence data:
-      Each tuple is (subseq_length, subseq_start, seq_id, subseq_id)
-      The tuples are not sorted. They are in the original order.
-      *maxSequenceLenth is the maximal number of subsequences in each sequence.
-   */
-  void getSeqLengthAndStart(
-      std::vector<std::tuple<int, int, int, int>>* seqLengthAndStart,
-      int* maxSequenceLength) const;
  /*
   Check Whether sequenceStartPositions is subset of
   subSequenceStartPositions.

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1408,6 +1408,14 @@ class SelectiveFCLayer(LayerBase):
                input_index, psize, dims, sparse, format)
        self.create_bias_parameter(bias, self.config.size)
+@config_layer('print')
+class PrintLayer(LayerBase):
+    def __init__(
+            self,
+            name,
+            inputs):
+        super(PrintLayer, self).__init__(name, 'print', 0, inputs)
 @config_layer('data')
 class DataLayer(LayerBase):
    def __init__(

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -52,7 +52,7 @@ __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel",
           'cross_entropy_with_selfnorm', 'cross_entropy',
           'multi_binary_label_cross_entropy',
           'rank_cost', 'lambda_cost', 'huber_cost',
-           'block_expand_layer', 'out_prod_layer',
+           'block_expand_layer', 'out_prod_layer', 'print_layer'
           ]
@@ -108,6 +108,8 @@ class LayerType(object):
    LINEAR_COMBINATION_LAYER = "convex_comb"
    BLOCK_EXPAND = "blockexpand"
+    PRINT_LAYER = "print"
    CTC_LAYER = "ctc"
    CRF_LAYER = "crf"
    CRF_DECODING_LAYER = "crf_decoding"
@@ -729,6 +731,19 @@ def fc_layer(input, size, act=None, name=None,
    return LayerOutput(name, LayerType.FC_LAYER, input, activation=act,
                       size=size)
+@wrap_name_default("print")
+def print_layer(input, name=None):
+    """
+    Print the output value of input layers. This layer is useful for debugging.
+    """
+    assert isinstance(input, list)
+    Layer(
+        name=name,
+        type=LayerType.PRINT_LAYER,
+        inputs=[l.name for l in input],
+    )
+    return LayerOutput(name, LayerType.PRINT_LAYER, input)
 @wrap_name_default("seq_pooling")
 @wrap_bias_attr_default(has_bias=False)

--- a/python/paddle/trainer_config_helpers/tests/layers_test_config.py
+++ b/python/paddle/trainer_config_helpers/tests/layers_test_config.py
@@ -34,6 +34,8 @@ out = fc_layer(input=[cos1, cos3, linear_comb, z],
               size=num_classes,
               act=SoftmaxActivation())
+print_layer(input=[out])
 outputs(classification_cost(out, data_layer(name="label", size=num_classes)))
 # for ctc