From 458b16f42a03fd68af4da05bb93fbc6bf2a75f9e Mon Sep 17 00:00:00 2001
From: Jacek Czaja <jacek.czaja@intel.com>
Date: Tue, 23 Oct 2018 11:41:19 +0200
Subject: [PATCH] Rebase of seqpool-max optimization

test=develop

- Added rough profiling

- Profiled maxpool itself

- First draft of max seqpool optimization (is_test added)

- Added unit tests to seqpool

- Cosmetic fixes

- Fix to UT of Seq pool

Disabled grad checking for sequence max pool when is_test is set to True

-Cosmetic fix to comment

test=develop

- Fix to GPU build

test=develop

- yet another GPU fix for sequence max pool

- Fix to comment

test=develop

- Change to API of sequence_pool

test=develop

- Yet another API spec change

test=develop
---
 paddle/fluid/API.spec                         |  2 +-
 .../fluid/operators/math/sequence_pooling.cc  | 48 +++++++++++++++++--
 .../fluid/operators/math/sequence_pooling.cu  |  2 +-
 .../fluid/operators/math/sequence_pooling.h   |  2 +-
 paddle/fluid/operators/sequence_pool_op.cc    |  1 +
 paddle/fluid/operators/sequence_pool_op.h     | 17 ++++---
 python/paddle/fluid/layers/nn.py              |  6 ++-
 .../fluid/tests/unittests/test_seq_pool.py    | 14 ++++++
 8 files changed, 77 insertions(+), 15 deletions(-)
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 2b8b82e74..e0707fdc3 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -64,7 +64,7 @@ paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', '
 paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
 paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
 paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
-paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
 paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc
index 7be8539a7..6d491dbf1 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
@@ -31,7 +31,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename T>
+template <typename T, bool is_test>
 class MaxSeqPoolFunctor {
  public:
   void operator()(const platform::CPUDeviceContext& context,
@@ -70,7 +70,41 @@ class MaxSeqPoolFunctor {
     }
   }
 };
+// Instantisation of Max Sequence Pooling for test phase eg. no need to fill
+// index buffer
+template <typename T>
+class MaxSeqPoolFunctor<T, true> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index) {
+    auto in_dims = input.dims();
+    auto out_dims = output->dims();
+    PADDLE_ENFORCE_GT(in_dims.size(), 1);
+    PADDLE_ENFORCE_GT(out_dims.size(), 1);
+    for (int64_t i = 1; i < in_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
+    }
+
+    auto starts = input.lod()[0];
+    const T* in_data = input.data<T>();
+    T* out_data = output->data<T>();
 
+    int64_t num_seq = out_dims[0];
+    int64_t dim = output->numel() / num_seq;
+    for (int64_t i = 0; i < num_seq; ++i) {
+      std::memcpy(&out_data[i * dim], &in_data[starts[i] * dim],
+                  dim * sizeof(T));
+      for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
+        for (int64_t k = 0; k < dim; ++k) {
+          if (in_data[j * dim + k] > out_data[i * dim + k]) {
+            out_data[i * dim + k] = in_data[j * dim + k];
+          }
+        }
+      }
+    }
+  }
+};
 template <typename T>
 class MaxSeqPoolGradFunctor {
  public:
@@ -188,11 +222,16 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
   /* max pool has index output */
   void operator()(const platform::CPUDeviceContext& context,
                   const std::string pooltype, const framework::LoDTensor& input,
-                  framework::Tensor* output,
+                  framework::Tensor* output, bool is_test,
                   framework::Tensor* index = nullptr) {
     if (pooltype == "MAX") {
-      math::MaxSeqPoolFunctor<T> max_pool;
-      max_pool(context, input, output, index);
+      if (is_test) {
+        math::MaxSeqPoolFunctor<T, true> max_pool;
+        max_pool(context, input, output, index);
+      } else {
+        math::MaxSeqPoolFunctor<T, false> max_pool;
+        max_pool(context, input, output, index);
+      }
       return;
     }
     if (pooltype == "LAST") {
@@ -200,6 +239,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
       last_pool(context, input, output);
       return;
     }
+
     if (pooltype == "FIRST") {
       math::FirstSeqPoolFunctor<T> first_pool;
       first_pool(context, input, output);
diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu
index a92aef805..0015fafbc 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cu
+++ b/paddle/fluid/operators/math/sequence_pooling.cu
@@ -133,7 +133,7 @@ class SequencePoolFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const std::string pooltype, const framework::LoDTensor& input,
-                  framework::Tensor* output,
+                  framework::Tensor* output, bool is_test,
                   framework::Tensor* index = nullptr) {
     auto& lod = input.lod()[0];
     const size_t item_dim = output->numel() / output->dims()[0];
diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h
index 8dcbee65d..a1046ea21 100644
--- a/paddle/fluid/operators/math/sequence_pooling.h
+++ b/paddle/fluid/operators/math/sequence_pooling.h
@@ -28,7 +28,7 @@ class SequencePoolFunctor {
   /* max pool has index output */
   void operator()(const DeviceContext& context, const std::string pooltype,
                   const framework::LoDTensor& input, framework::Tensor* output,
-                  framework::Tensor* index = nullptr);
+                  bool is_test = false, framework::Tensor* index = nullptr);
 };
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/sequence_pool_op.cc b/paddle/fluid/operators/sequence_pool_op.cc
index 15d3f064e..217bb1610 100644
--- a/paddle/fluid/operators/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_pool_op.cc
@@ -47,6 +47,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
               "(Tensor<int>) This tensor is used for the sequence max-pooling "
               "to record the max indexes.")
         .AsIntermediate();
+    AddAttr<bool>("is_test", "").SetDefault(false);
     AddAttr<std::string>(
         "pooltype",
         "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.")
diff --git a/paddle/fluid/operators/sequence_pool_op.h b/paddle/fluid/operators/sequence_pool_op.h
index 2aa20792f..f2e4a55de 100644
--- a/paddle/fluid/operators/sequence_pool_op.h
+++ b/paddle/fluid/operators/sequence_pool_op.h
@@ -32,10 +32,6 @@ class SequencePoolKernel : public framework::OpKernel<T> {
     auto* in = context.Input<LoDTensor>("X");
     auto* out = context.Output<Tensor>("Out");
     std::string pooltype = context.Attr<std::string>("pooltype");
-    Tensor* index = nullptr;
-    if (pooltype == "MAX") {
-      index = context.Output<Tensor>("MaxIndex");
-    }
 
     auto dims = in->dims();
     auto lod = in->lod();
@@ -48,13 +44,22 @@ class SequencePoolKernel : public framework::OpKernel<T> {
     dims[0] = lod[0].size() - 1;
     out->Resize({dims});
     out->mutable_data<T>(context.GetPlace());
-    if (pooltype == "MAX") {
+    Tensor* index = nullptr;
+
+    const bool is_test = context.Attr<bool>("is_test");
+
+    // Do not create index buffer for inference (is_test) mode
+    // TODO(jczaja): Skip index buffer creation for other devices eg. GPU
+    if (pooltype == "MAX" &&
+        (is_test == false ||
+         platform::is_cpu_place(context.GetPlace()) == false)) {
+      index = context.Output<Tensor>("MaxIndex");
       index->Resize({dims});
       index->mutable_data<int>(context.GetPlace());
     }
     math::SequencePoolFunctor<DeviceContext, T> pool;
     pool(context.template device_context<DeviceContext>(), pooltype, *in, out,
-         index);
+         is_test, index);
   }
 };
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 4bfa89d9f..b9d1b7c28 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1823,7 +1823,7 @@ def conv3d(input,
     return helper.append_activation(pre_act)
 
 
-def sequence_pool(input, pool_type):
+def sequence_pool(input, pool_type, is_test=False):
     """
     This function add the operator for sequence pooling.
     It pools features of all time-steps of each instance, and is applied
@@ -1860,6 +1860,7 @@ def sequence_pool(input, pool_type):
         input(variable): The input variable which is a LoDTensor.
         pool_type (string): The pooling type of sequence_pool.
             It supports average, sum, sqrt and max.
+        is_test(bool, Default False): Used distinguish training from scoring mode.
 
     Returns:
         The sequence pooling variable which is a Tensor.
@@ -1887,7 +1888,8 @@ def sequence_pool(input, pool_type):
         inputs={"X": input},
         outputs={"Out": pool_out,
                  "MaxIndex": max_index},
-        attrs={"pooltype": pool_type.upper()})
+        attrs={"pooltype": pool_type.upper(),
+               "is_test": is_test})
 
     # when pool_type is max, variable max_index is initialized,
     # so we stop the gradient explicitly here
diff --git a/python/paddle/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py
index 641eb03a5..a80ad5b07 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_pool.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py
@@ -184,6 +184,20 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
             out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
 
 
+class TestSeqMaxPool2DInference(TestSeqMaxPool2D):
+    def compute(self, x, offset, out):
+        self.attrs = {'pooltype': "MAX", 'is_test': True}
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 11))
+            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
+
+    def test_check_grad(self):
+        """Grad computation does not apply to Sequence MAX 
+            Pool executed when is_test is true """
+        return
+
+
 class TestSeqLastPool2D(TestSeqAvgPool2D):
     def compute(self, x, offset, out):
         self.attrs = {'pooltype': "LAST"}
-- 
GitLab