Merge pull request #2701 from luotao1/stride

stride pooling for max and average layer

Merge pull request #2701 from luotao1/stride
stride pooling for max and average layer
7f380c1b · Tao Luo · GitHub · 98378968 · e7b071f3 · 7f380c1b
13 changed files
--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@@ -25,6 +25,10 @@ namespace paddle {
 * If SequenceLevel = kNonSeq:
 *    Output: output size is the number of input sequences (NOT input instances)
 *    output[i] = average_{for each instance in this sequence}{input[i]}
+ *    If stride_ > 0:
+ *      Output: a shortened sequence. Stride is the step size by which we
+ *              slide a window upon the input sequence, and the average pooling
+ *              operation is then applied to each interval independently.
 * If SequenceLevel = kSeq:
 *    Check input sequence must has sub-sequence
 *    Output: output size is the number of input sub-sequences

--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@@ -26,6 +26,10 @@ namespace paddle {
 * If SequenceLevel = kNonSeq:
 *    Output: output size is the number of input sequences (NOT input instances)
 *    output[i] = max_{for each instance in this sequence}{input[i]}
+ *    If stride_ > 0:
+ *      Output: a shortened sequence. Stride is the step size by which we
+ *              slide a window upon the input sequence, and the max pooling
+ *              operation is then applied to each interval independently.
 * If SequenceLevel = kSeq:
 *    Check input sequence must has sub-sequence
 *    Output: output size is the number of input sub-sequences

--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -26,10 +26,9 @@ namespace paddle {
 * If SequenceLevel = kNonseq:
 *   Output: a sequence containing only the last instance of the input sequence
 *   If stride_ > 0:
- *      Output: a shorten sequence. The operation of getting last instance of a
- *              sequence is independently performed on every slice of the input
- *              sequence, which is obtained by sliding a window with the window
- *              size set to stride_.
+ *      Output: a shortened sequence. Stride is the step size by which we
+ *              slide a window upon the input sequence, and the last-instance
+ *              operation is then applied to each interval independently.
 * If SequenceLevel = kSeq:
 *   Check input sequence must has sub-sequence
 *   Output: a sequence containing only the last instance of each sub-sequence
@@ -73,8 +72,7 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
 void SequenceLastInstanceLayer::forward(PassType passType) {
  SequencePoolLayer::forward(passType);

-  auto starts = (stride_ > 0) ? stridePositions_->getData()
-                              : startPositions_->getData(false);
+  auto starts = startPositions_->getData(false);
  MatrixPtr inputValue = getInputValue(0);
  MatrixPtr outputValue = getOutputValue();


--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
@@ -72,9 +72,8 @@ void SequencePoolLayer::forward(PassType passType) {
  if (stride_ > 0) {
    CHECK_EQ(input.hasSubseq(), 0UL)
        << "sequence stride pooling is invalid for hasSubseq now";
-    output_.poolSequenceWithStride(
-        input, stride_, &stridePositions_, reversed_);
-    newBatchSize_ = stridePositions_->getSize() - 1;
+    output_.poolSequenceWithStride(input, stride_, &startPositions_, reversed_);
+    newBatchSize_ = startPositions_->getSize() - 1;
  }

  resetOutput(newBatchSize_, dim);

--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ b/paddle/gserver/layers/SequencePoolLayer.h
@@ -28,8 +28,9 @@ namespace paddle {
 * sequence}{input[i]}
 *    If stride_ > 0:
 *        Check input sequence must not have sub-sequence
- *        Output: a shorten sequence, pooling is performed upon a small local
- *                area
+ *        Output: a shortened sequence. Stride is the step size by which we
+ *                slide a window upon the input sequence, and the pooling
+ *                operation is then applied to each interval independently.
 * If SequenceLevel = kSeq:
 *    Check input sequence must has sub-sequence
 *    Output: output size is the number of input sub-sequences
@@ -47,8 +48,6 @@ protected:
  size_t newBatchSize_;
  ICpuGpuVectorPtr startPositions_;
  int stride_;
-  // Store the start position of each window.
-  IVectorPtr stridePositions_;
  // Whether the input sequence is reversed or not.
  bool reversed_ = false;


--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -845,8 +845,12 @@ void testDegradeLayer(bool hasSubseq,

 TEST(Layer, MaxLayer) {
  testDegradeLayer(false, "max", "non-seq", -1);  // seq max to non-seq
-  testDegradeLayer(true, "max", "non-seq", -1);   // hasSubseq max to non-seq
-  testDegradeLayer(true, "max", "seq", -1);       // hasSubseq max to seq
+  testDegradeLayer(false,
+                   "max",
+                   "non-seq",
+                   5);  // seq max to a shorten seq, stride window = 5
+  testDegradeLayer(true, "max", "non-seq", -1);  // hasSubseq max to non-seq
+  testDegradeLayer(true, "max", "seq", -1);      // hasSubseq max to seq
 }

 TEST(Layer, SequenceLastInstanceLayer) {
@@ -868,6 +872,10 @@ TEST(Layer, SequenceLastInstanceLayer) {

 TEST(Layer, AverageLayer) {
  testDegradeLayer(false, "average", "non-seq", -1);  // seq average to non-seq
+  testDegradeLayer(false,
+                   "average",
+                   "non-seq",
+                   5);  // seq average to a shorten seq, stride window = 5
  testDegradeLayer(
      true, "average", "non-seq", -1);           // hasSubseq average to non-seq
  testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq

--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -561,7 +561,7 @@ void Argument::degradeSequence(const Argument& input) {

 void Argument::poolSequenceWithStride(const Argument& input,
                                      size_t stride,
-                                      IVectorPtr* stridePostions,
+                                      ICpuGpuVectorPtr* stridePostions,
                                      bool reversed) {
  // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5,
  // then sequenceStartPositions = [0, 2, 3, 4, 7].
@@ -598,8 +598,8 @@ void Argument::poolSequenceWithStride(const Argument& input,
  stridePos.emplace_back(starts[numSequences]);
  int size = stridePos.size();
  CHECK_EQ(size - 1, tgtBuf[numSequences]);
-  IVector::resizeOrCreate(*stridePostions, size, false);
-  (*stridePostions)->copyFrom(stridePos.data(), size);
+  ICpuGpuVector::resizeOrCreate(*stridePostions, size, false);
+  (*stridePostions)->getMutableVector(false)->copyFrom(stridePos.data(), size);
 }

 void Argument::getValueString(

--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -299,7 +299,7 @@ struct Argument {
   */
  void poolSequenceWithStride(const Argument& input,
                              size_t stride,
-                              IVectorPtr* stridePositions,
+                              ICpuGpuVectorPtr* stridePositions,
                              bool reversed = false);
  /**
   * @brief getValueString will return the argument's output in string. There

--- a/paddle/parameter/tests/test_argument.cpp
+++ b/paddle/parameter/tests/test_argument.cpp
@@ -31,7 +31,7 @@ TEST(Argument, poolSequenceWithStride) {
  int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30};

  for (auto reversed : {false, true}) {
-    IVectorPtr stridePositions;
+    ICpuGpuVectorPtr stridePositions;
    output.poolSequenceWithStride(
        input, 5 /* stride */, &stridePositions, reversed);

@@ -45,7 +45,7 @@ TEST(Argument, poolSequenceWithStride) {
    CHECK_EQ(stridePositions->getSize(), 8UL);
    auto result = reversed ? strideResultReversed : strideResult;
    for (int i = 0; i < 8; i++) {
-      CHECK_EQ(stridePositions->getData()[i], result[i]);
+      CHECK_EQ(stridePositions->getData(false)[i], result[i]);
    }
  }
 }

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2466,10 +2466,14 @@ class MaxLayer(LayerBase):
                 trans_type='non-seq',
                 bias=False,
                 output_max_index=None,
+                 stride=-1,
                 **xargs):
        super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs)
        config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input')
+        if trans_type == 'seq':
+            config_assert(stride == -1, 'subseq does not support stride window')
        self.config.trans_type = trans_type
+        self.config.seq_pool_stride = stride
        for input_index in xrange(len(self.inputs)):
            input_layer = self.get_input_layer(input_index)
            self.set_layer_size(input_layer.size)
@@ -2731,11 +2735,15 @@ class AverageLayer(LayerBase):
                 average_strategy='average',
                 trans_type='non-seq',
                 bias=False,
+                 stride=-1,
                 **xargs):
        super(AverageLayer, self).__init__(
            name, 'average', 0, inputs=inputs, **xargs)
        self.config.average_strategy = average_strategy
+        if trans_type == 'seq':
+            config_assert(stride == -1, 'subseq does not support stride window')
        self.config.trans_type = trans_type
+        self.config.seq_pool_stride = stride
        config_assert(len(inputs) == 1, 'AverageLayer must have 1 input')
        for input_index in xrange(len(self.inputs)):
            input_layer = self.get_input_layer(input_index)

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1246,10 +1246,19 @@ def pooling_layer(input,
                  name=None,
                  bias_attr=None,
                  agg_level=AggregateLevel.TO_NO_SEQUENCE,
+                  stride=-1,
                  layer_attr=None):
    """
    Pooling layer for sequence inputs, not used for Image.

+    If stride > 0, this layer slides a window whose size is determined by stride,
+    and returns the pooling value of each window as the output. Thus, a long
+    sequence will be shortened.
+
+    The parameter stride specifies the intervals at which to apply the pooling
+    operation. Note that sequences with sub-sequences do not support stride
+    pooling, so stride must keep its default value of -1 for them.
+
    The example usage is:

    .. code-block:: python
@@ -1268,6 +1277,8 @@ def pooling_layer(input,
    :param pooling_type: Type of pooling, MaxPooling(default), AvgPooling,
                         SumPooling, SquareRootNPooling.
    :type pooling_type: BasePoolingType|None
+    :param stride: The step size between successive pooling regions.
+    :type stride: Int
    :param bias_attr: Bias parameter attribute. False if no bias.
    :type bias_attr: ParameterAttribute|None|False
    :param layer_attr: The Extra Attributes for layer, such as dropout.
@@ -1285,12 +1296,16 @@ def pooling_layer(input,
        extra_dict['output_max_index'] = pooling_type.output_max_index
    extra_dict.update(ExtraLayerAttribute.to_kwargs(layer_attr))

+    if agg_level == AggregateLevel.TO_SEQUENCE:
+        assert stride == -1
+
    Layer(
        name=name,
        type=pooling_type.name,
        inputs=[Input(input.name)],
        bias=ParamAttr.to_bias(bias_attr),
        trans_type=agg_level,
+        stride=stride,
        **extra_dict)

    return LayerOutput(
@@ -1552,7 +1567,7 @@ def last_seq(input,
    :type name: basestring
    :param input: Input layer name.
    :type input: LayerOutput
-    :param stride: window size.
+    :param stride: The step size between successive pooling regions.
    :type stride: Int
    :param layer_attr: extra layer attributes.
    :type layer_attr: ExtraLayerAttribute.
@@ -1608,7 +1623,7 @@ def first_seq(input,
    :type name: basestring
    :param input: Input layer name.
    :type input: LayerOutput
-    :param stride: window size.
+    :param stride: The step size between successive pooling regions.
    :type stride: Int
    :param layer_attr: extra layer attributes.
    :type layer_attr: ExtraLayerAttribute.

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
@@ -14,6 +14,7 @@ layers {
    input_layer_name: "dat_in"
  }
  trans_type: "seq"
+  seq_pool_stride: -1
 }
 layers {
  name: "__seq_pooling_1__"
@@ -24,6 +25,7 @@ layers {
    input_layer_name: "dat_in"
  }
  trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
  name: "__seq_pooling_2__"
@@ -35,6 +37,7 @@ layers {
  }
  average_strategy: "average"
  trans_type: "seq"
+  seq_pool_stride: -1
 }
 layers {
  name: "__seq_pooling_3__"
@@ -46,6 +49,7 @@ layers {
  }
  average_strategy: "average"
  trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
  name: "__seq_pooling_4__"
@@ -57,6 +61,7 @@ layers {
  }
  average_strategy: "sum"
  trans_type: "seq"
+  seq_pool_stride: -1
 }
 layers {
  name: "__seq_pooling_5__"
@@ -68,6 +73,7 @@ layers {
  }
  average_strategy: "sum"
  trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
  name: "__seq_pooling_6__"
@@ -77,8 +83,44 @@ layers {
  inputs {
    input_layer_name: "dat_in"
  }
+  trans_type: "non-seq"
+  seq_pool_stride: 5
+}
+layers {
+  name: "__seq_pooling_7__"
+  type: "average"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "dat_in"
+  }
+  average_strategy: "average"
+  trans_type: "non-seq"
+  seq_pool_stride: 5
+}
+layers {
+  name: "__seq_pooling_8__"
+  type: "average"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "dat_in"
+  }
+  average_strategy: "sum"
+  trans_type: "non-seq"
+  seq_pool_stride: 5
+}
+layers {
+  name: "__seq_pooling_9__"
+  type: "max"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "dat_in"
+  }
  output_max_index: true
  trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 input_layer_names: "dat_in"
 output_layer_names: "__seq_pooling_0__"
@@ -88,6 +130,9 @@ output_layer_names: "__seq_pooling_3__"
 output_layer_names: "__seq_pooling_4__"
 output_layer_names: "__seq_pooling_5__"
 output_layer_names: "__seq_pooling_6__"
+output_layer_names: "__seq_pooling_7__"
+output_layer_names: "__seq_pooling_8__"
+output_layer_names: "__seq_pooling_9__"
 sub_models {
  name: "root"
  layer_names: "dat_in"
@@ -98,6 +143,9 @@ sub_models {
  layer_names: "__seq_pooling_4__"
  layer_names: "__seq_pooling_5__"
  layer_names: "__seq_pooling_6__"
+  layer_names: "__seq_pooling_7__"
+  layer_names: "__seq_pooling_8__"
+  layer_names: "__seq_pooling_9__"
  input_layer_names: "dat_in"
  output_layer_names: "__seq_pooling_0__"
  output_layer_names: "__seq_pooling_1__"
@@ -106,6 +154,9 @@ sub_models {
  output_layer_names: "__seq_pooling_4__"
  output_layer_names: "__seq_pooling_5__"
  output_layer_names: "__seq_pooling_6__"
+  output_layer_names: "__seq_pooling_7__"
+  output_layer_names: "__seq_pooling_8__"
+  output_layer_names: "__seq_pooling_9__"
  is_recurrent_layer_group: false
 }

--- a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
@@ -14,6 +14,14 @@ for pt in POOL_TYPE:
    for al in AGG_LEVEL:
        opts.append(pooling_layer(input=din, agg_level=al, pooling_type=pt()))

+for pt in POOL_TYPE:
+    opts.append(
+        pooling_layer(
+            input=din,
+            agg_level=AggregateLevel.TO_NO_SEQUENCE,
+            pooling_type=pt(),
+            stride=5))
+
 opts.append(
    pooling_layer(
        input=din, pooling_type=MaxPooling(output_max_index=True)))