diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h index baa58ca2d7a6970f0d2f3ef6f8609404c82efa30..adf7ab4ae47106689e10e7fd41bdf2253111df0d 100644 --- a/paddle/gserver/layers/MaxLayer.h +++ b/paddle/gserver/layers/MaxLayer.h @@ -26,6 +26,11 @@ namespace paddle { * If SequenceLevel = kNonSeq: * Output: output size is the number of input sequences (NOT input instances) * output[i] = max_{for each instance in this sequence}{input[i]} + * If stride_ > 0: + * Output: a shortened sequence. The operation of getting max instance of a + * sequence is independently performed on every slice of the input + * sequence, which is obtained by sliding a window with the window + * size set to stride_. * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: output size is the number of input sub-sequences diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp index 944c7051668dccf39dd2ace14986d43c8a14e452..8127cbf09c2e2e63176703fd6db5564ae067b49d 100644 --- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp +++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp @@ -73,8 +73,7 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap, void SequenceLastInstanceLayer::forward(PassType passType) { SequencePoolLayer::forward(passType); - auto starts = (stride_ > 0) ? 
stridePositions_->getData() - : startPositions_->getData(false); + auto starts = startPositions_->getData(false); MatrixPtr inputValue = getInputValue(0); MatrixPtr outputValue = getOutputValue(); diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp index 4179a9e7e0cb58fcb49bff712e62b9f3fea373bd..2a693b110a562ce3938643c919bfb1a4d3cd1f80 100644 --- a/paddle/gserver/layers/SequencePoolLayer.cpp +++ b/paddle/gserver/layers/SequencePoolLayer.cpp @@ -72,9 +72,8 @@ void SequencePoolLayer::forward(PassType passType) { if (stride_ > 0) { CHECK_EQ(input.hasSubseq(), 0UL) << "sequence stride pooling is invalid for hasSubseq now"; - output_.poolSequenceWithStride( - input, stride_, &stridePositions_, reversed_); - newBatchSize_ = stridePositions_->getSize() - 1; + output_.poolSequenceWithStride(input, stride_, &startPositions_, reversed_); + newBatchSize_ = startPositions_->getSize() - 1; } resetOutput(newBatchSize_, dim); diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h index 293d1bf27823ffb0ebddba95461883d646f159ae..058627def8af2439c84a31d7df6aa11c243399b7 100644 --- a/paddle/gserver/layers/SequencePoolLayer.h +++ b/paddle/gserver/layers/SequencePoolLayer.h @@ -47,8 +47,6 @@ protected: size_t newBatchSize_; ICpuGpuVectorPtr startPositions_; int stride_; - // Store the start position of each window. - IVectorPtr stridePositions_; // Whether the input sequence is reversed or not. 
bool reversed_ = false; diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 297756025bcad79d49ec321414ed2e91f1c0758a..ed067e7c3a1c5e15a062a77c7ae5d9b90d3edcd3 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -845,8 +845,12 @@ void testDegradeLayer(bool hasSubseq, TEST(Layer, MaxLayer) { testDegradeLayer(false, "max", "non-seq", -1); // seq max to non-seq - testDegradeLayer(true, "max", "non-seq", -1); // hasSubseq max to non-seq - testDegradeLayer(true, "max", "seq", -1); // hasSubseq max to seq + testDegradeLayer(false, + "max", + "non-seq", + 5); // seq max to a shortened seq, stride window = 5 + testDegradeLayer(true, "max", "non-seq", -1); // hasSubseq max to non-seq + testDegradeLayer(true, "max", "seq", -1); // hasSubseq max to seq } TEST(Layer, SequenceLastInstanceLayer) { @@ -868,6 +872,10 @@ TEST(Layer, SequenceLastInstanceLayer) { TEST(Layer, AverageLayer) { testDegradeLayer(false, "average", "non-seq", -1); // seq average to non-seq + testDegradeLayer(false, + "average", + "non-seq", + 5); // seq average to a shortened seq, stride window = 5 testDegradeLayer( true, "average", "non-seq", -1); // hasSubseq average to non-seq testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 5beced3bb5a1050078f88dfd4350a2df71d27f35..ef72b973c1a465a8ac03cae1070429160eac0ac1 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -561,7 +561,7 @@ void Argument::degradeSequence(const Argument& input) { void Argument::poolSequenceWithStride(const Argument& input, size_t stride, - IVectorPtr* stridePostions, + ICpuGpuVectorPtr* stridePostions, bool reversed) { // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5, // then sequenceStartPositions = [0, 2, 3, 4, 7]. 
@@ -598,8 +598,8 @@ void Argument::poolSequenceWithStride(const Argument& input, stridePos.emplace_back(starts[numSequences]); int size = stridePos.size(); CHECK_EQ(size - 1, tgtBuf[numSequences]); - IVector::resizeOrCreate(*stridePostions, size, false); - (*stridePostions)->copyFrom(stridePos.data(), size); + ICpuGpuVector::resizeOrCreate(*stridePostions, size, false); + (*stridePostions)->getMutableVector(false)->copyFrom(stridePos.data(), size); } void Argument::getValueString( diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index 09bd633616730dc9475edc596128166f4f70b0cd..0ccdef802e71b659788cfd24f28ebe43e1917db1 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -299,7 +299,7 @@ struct Argument { */ void poolSequenceWithStride(const Argument& input, size_t stride, - IVectorPtr* stridePositions, + ICpuGpuVectorPtr* stridePositions, bool reversed = false); /** * @brief getValueString will return the argument's output in string. There diff --git a/paddle/parameter/tests/test_argument.cpp b/paddle/parameter/tests/test_argument.cpp index 98ab013548734059060eb06ce1a7cec23dbf1b72..19df6ea95745609a4eb7487d422e61d2f0b269cc 100644 --- a/paddle/parameter/tests/test_argument.cpp +++ b/paddle/parameter/tests/test_argument.cpp @@ -31,7 +31,7 @@ TEST(Argument, poolSequenceWithStride) { int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30}; for (auto reversed : {false, true}) { - IVectorPtr stridePositions; + ICpuGpuVectorPtr stridePositions; output.poolSequenceWithStride( input, 5 /* stride */, &stridePositions, reversed); @@ -45,7 +45,7 @@ TEST(Argument, poolSequenceWithStride) { CHECK_EQ(stridePositions->getSize(), 8UL); auto result = reversed ? 
strideResultReversed : strideResult; for (int i = 0; i < 8; i++) { - CHECK_EQ(stridePositions->getData()[i], result[i]); + CHECK_EQ(stridePositions->getData(false)[i], result[i]); } } } diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index b7418101d83fde1b91781d3a42b056cc7708cba9..5ca7df74765517773d7825a361edd7b561742d81 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2420,10 +2420,14 @@ class MaxLayer(LayerBase): trans_type='non-seq', bias=False, output_max_index=None, + stride=-1, **xargs): super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs) config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input') + if trans_type == 'seq': + config_assert(stride == -1, 'subseq does not support stride window') self.config.trans_type = trans_type + self.config.seq_pool_stride = stride for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) self.set_layer_size(input_layer.size) @@ -2685,11 +2689,15 @@ class AverageLayer(LayerBase): average_strategy='average', trans_type='non-seq', bias=False, + stride=-1, **xargs): super(AverageLayer, self).__init__( name, 'average', 0, inputs=inputs, **xargs) self.config.average_strategy = average_strategy + if trans_type == 'seq': + config_assert(stride == -1, 'subseq does not support stride window') self.config.trans_type = trans_type + self.config.seq_pool_stride = stride config_assert(len(inputs) == 1, 'AverageLayer must have 1 input') for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index a601d5c84ad222785e68b9fa81c51b1e120b4f29..5e8bf4b2034b7aa2a75b9a3016048b800f817936 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -1090,10 +1090,16 @@ def pooling_layer(input, 
name=None, bias_attr=None, agg_level=AggregateLevel.TO_NO_SEQUENCE, + stride=-1, layer_attr=None): """ Pooling layer for sequence inputs, not used for Image. + If stride > 0, this layer slides a window whose size is determined by stride, + and returns the pooling value of the window as the output. Thus, a long sequence + will be shortened. Note that stride must keep its default value of -1 when + agg_level is AggregateLevel.TO_SEQUENCE. + The example usage is: .. code-block:: python @@ -1112,6 +1118,8 @@ def pooling_layer(input, :param pooling_type: Type of pooling, MaxPooling(default), AvgPooling, SumPooling, SquareRootNPooling. :type pooling_type: BasePoolingType|None + :param stride: The size of the sliding window; used only when stride > 0. + :type stride: int :param bias_attr: Bias parameter attribute. False if no bias. :type bias_attr: ParameterAttribute|None|False :param layer_attr: The Extra Attributes for layer, such as dropout. @@ -1129,12 +1137,16 @@ def pooling_layer(input, extra_dict['output_max_index'] = pooling_type.output_max_index extra_dict.update(ExtraLayerAttribute.to_kwargs(layer_attr)) + if agg_level == AggregateLevel.TO_SEQUENCE: + assert stride == -1 + Layer( name=name, type=pooling_type.name, inputs=[Input(input.name)], bias=ParamAttr.to_bias(bias_attr), trans_type=agg_level, + stride=stride, **extra_dict) return LayerOutput( diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr index 5a217f5544a8a3b4704b158dfeb92f747b7bd94b..8989561df04a60c906c06432fd857227a3814194 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr @@ -14,6 +14,7 @@ layers { input_layer_name: "dat_in" } trans_type: "seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_1__" @@ -24,6 +25,7 @@ layers { input_layer_name: "dat_in" } trans_type: 
"non-seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_2__" @@ -35,6 +37,7 @@ layers { } average_strategy: "average" trans_type: "seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_3__" @@ -46,6 +49,7 @@ layers { } average_strategy: "average" trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_4__" @@ -57,6 +61,7 @@ layers { } average_strategy: "sum" trans_type: "seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_5__" @@ -68,6 +73,7 @@ layers { } average_strategy: "sum" trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__seq_pooling_6__" @@ -77,8 +83,44 @@ layers { inputs { input_layer_name: "dat_in" } + trans_type: "non-seq" + seq_pool_stride: 5 +} +layers { + name: "__seq_pooling_7__" + type: "average" + size: 100 + active_type: "" + inputs { + input_layer_name: "dat_in" + } + average_strategy: "average" + trans_type: "non-seq" + seq_pool_stride: 5 +} +layers { + name: "__seq_pooling_8__" + type: "average" + size: 100 + active_type: "" + inputs { + input_layer_name: "dat_in" + } + average_strategy: "sum" + trans_type: "non-seq" + seq_pool_stride: 5 +} +layers { + name: "__seq_pooling_9__" + type: "max" + size: 100 + active_type: "" + inputs { + input_layer_name: "dat_in" + } output_max_index: true trans_type: "non-seq" + seq_pool_stride: -1 } input_layer_names: "dat_in" output_layer_names: "__seq_pooling_0__" @@ -88,6 +130,9 @@ output_layer_names: "__seq_pooling_3__" output_layer_names: "__seq_pooling_4__" output_layer_names: "__seq_pooling_5__" output_layer_names: "__seq_pooling_6__" +output_layer_names: "__seq_pooling_7__" +output_layer_names: "__seq_pooling_8__" +output_layer_names: "__seq_pooling_9__" sub_models { name: "root" layer_names: "dat_in" @@ -98,6 +143,9 @@ sub_models { layer_names: "__seq_pooling_4__" layer_names: "__seq_pooling_5__" layer_names: "__seq_pooling_6__" + layer_names: "__seq_pooling_7__" + layer_names: "__seq_pooling_8__" + layer_names: "__seq_pooling_9__" input_layer_names: 
"dat_in" output_layer_names: "__seq_pooling_0__" output_layer_names: "__seq_pooling_1__" @@ -106,6 +154,9 @@ sub_models { output_layer_names: "__seq_pooling_4__" output_layer_names: "__seq_pooling_5__" output_layer_names: "__seq_pooling_6__" + output_layer_names: "__seq_pooling_7__" + output_layer_names: "__seq_pooling_8__" + output_layer_names: "__seq_pooling_9__" is_recurrent_layer_group: false } diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py index 3c49eb56c1363a6a3f365fe56e16a8b484c8a004..3c205eabd80492a68383fdbecd14a7d6db3e16eb 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py @@ -14,6 +14,14 @@ for pt in POOL_TYPE: for al in AGG_LEVEL: opts.append(pooling_layer(input=din, agg_level=al, pooling_type=pt())) +for pt in POOL_TYPE: + opts.append( + pooling_layer( + input=din, + agg_level=AggregateLevel.TO_NO_SEQUENCE, + pooling_type=pt(), + stride=5)) + opts.append( pooling_layer( input=din, pooling_type=MaxPooling(output_max_index=True)))