Finish the SequenceSoftmaxGradKernel, using SoftmaxGradFunctor.

03897f25 · Liu Yiqun · 05ed8ee8 · 03897f25 · 03897f25 · 03897f25
4 changed file
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */

 #include "paddle/operators/mul_op.h"

@@ -35,12 +35,14 @@ class MulOp : public framework::OperatorWithKernel {
    int x_num_col_dims = ctx->Attrs().Get<int>("x_num_col_dims");
    int y_num_col_dims = ctx->Attrs().Get<int>("y_num_col_dims");

-    PADDLE_ENFORCE(x_dims.size() > x_num_col_dims,
-                   "The rank of input tensor X should be larger than "
-                   "`mul_op`'s `x_num_col_dims`.");
-    PADDLE_ENFORCE(y_dims.size() > y_num_col_dims,
-                   "The rank of input tensor Y should be larger than "
-                   "`mul_op`'s `y_num_col_dims`.");
+    PADDLE_ENFORCE_GT(
+        x_dims.size(), x_num_col_dims,
+        "The input tensor X's rank of MulOp should be larger than "
+        "x_num_col_dims.");
+    PADDLE_ENFORCE_GT(
+        y_dims.size(), y_num_col_dims,
+        "The input tensor Y's rank of MulOp should be larger than "
+        "y_num_col_dims.");

    auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
    auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);

--- a/paddle/operators/sequence_softmax_op.cc
+++ b/paddle/operators/sequence_softmax_op.cc
@@ -22,41 +22,42 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.InputVar("X"), "Input(X) of SequenceSoftmaxOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.OutputVar("Out"),
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceSoftmaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of SequenceSoftmaxOp should not be null.");
-
-    auto *x = ctx.Input<framework::LoDTensor>("X");
-    auto lod = x->lod();
-    auto dims = x->dims();
-    PADDLE_ENFORCE_GE(
-        dims[0],
-        /* batch_size */ static_cast<int64_t>(lod[0].size() - 1),
-        "The first dimension of Input(X) should be larger than batch size.");
-
-    const size_t level = lod.size() - 1;
-    PADDLE_ENFORCE_EQ(x->numel(), static_cast<int64_t>(lod[level].back()),
-                      "The width of each timestep in Input(X) of "
-                      "SequenceSoftmaxOp should be 1.");
-
-    std::cout << DebugString() << std::endl;
-
-    ctx.Output<framework::LoDTensor>("Out")->Resize({dims});
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
  }
 };

 class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SequenceSoftmaxOpMaker(framework::OpProto *proto,
-                         framework::OpAttrChecker *op_checker)
+  SequenceSoftmaxOpMaker(framework::OpProto* proto,
+                         framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(LoDTensor)");
-    AddOutput("Out", "(LoDTensor)");
+    AddInput("X",
+             "(LoDTensor) 1-D or 2-D input LoDTensor with the 2-nd dimension "
+             "of length 1.");
+    AddOutput("Out",
+              "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension "
+              "of length 1.");
    AddComment(R"DOC(
-Softmax of Sequence.
+SequenceSoftmaxOp computes softmax activation among all time-steps for each
+sequences. The dimension of each time-step should be 1. Thus, the shape of
+input Tensor can be either [N, 1] or [N], where N is the sum of all sequences'
+length.
+
+Equation:
+    for i-th sequence in mini-batch:
+        Out(X[lod[i]:lod[i+1]], :) =
+            exp(X[lod[i]:lod[i+1], :]) / sum(exp(X[lod[i]:lod[i+1], :]))
+
+For example, for a mini-batch of 3 sequences with variable-length,
+each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7],
+then softmax will be computed among X[0:2, :], X[2:5, :], X[2:7, :]
+and N turns out to be 7.
 )DOC");
  }
 };
@@ -66,7 +67,25 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {}
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Out"),
+                   "Input(Out) of SequenceSoftmaxGradOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput(framework::GradVarName("Out")),
+        "Input(Out@GRAD) of SequenceSoftmaxGradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceSoftmaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) of SequenceSoftmaxOp should not be null.");
+
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputDim("Out"),
+        ctx->GetInputDim(framework::GradVarName("Out")),
+        "Input(Out) and Input(Out@GRAD) of SequenceSoftmaxGradOp should be of "
+        "the same shape.");
+
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
 };

 }  // namespace operators
@@ -81,4 +100,4 @@ REGISTER_OP_CPU_KERNEL(
    ops::SequenceSoftmaxKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
    sequence_softmax_grad,
-    ops::SequenceSoftmaxGradKernel<paddle::platform::GPUPlace, float>);
+    ops::SequenceSoftmaxGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/sequence_softmax_op.h
+++ b/paddle/operators/sequence_softmax_op.h
@@ -16,19 +16,13 @@ limitations under the License. */

 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/softmax_function.h"
+#include "paddle/operators/math/softmax.h"

 namespace paddle {
 namespace operators {

 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;

 template <typename Place, typename T>
 class SequenceSoftmaxKernel : public framework::OpKernel {
@@ -38,7 +32,17 @@ class SequenceSoftmaxKernel : public framework::OpKernel {
    auto* out = ctx.Output<LoDTensor>("Out");

    auto lod = x->lod();
+    auto dims = x->dims();
+
+    PADDLE_ENFORCE_GE(
+        dims[0],
+        /* batch_size */ static_cast<int64_t>(lod[0].size() - 1),
+        "The first dimension of Input(X) should be larger than batch size.");
+
    const size_t level = lod.size() - 1;
+    PADDLE_ENFORCE_EQ(x->numel(), static_cast<int64_t>(lod[level].back()),
+                      "The width of each timestep in Input(X) of "
+                      "SequenceSoftmaxOp should be 1.");

    out->mutable_data<T>(ctx.GetPlace());
    for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
@@ -48,10 +52,10 @@ class SequenceSoftmaxKernel : public framework::OpKernel {
      Tensor out_i = out->Slice<T>(start_pos, end_pos);

      // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
-      framework::DDim dims = framework::make_ddim({1UL, end_pos - start_pos});
-      x_i.Resize(dims);
-      out_i.Resize(dims);
-      math::SoftmaxFunctor<Place, T>()(&x_i, &out_i, ctx);
+      framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
+      x_i.Resize(dims_i);
+      out_i.Resize(dims_i);
+      math::SoftmaxFunctor<Place, T>()(ctx, &x_i, &out_i);
    }
  }
 };
@@ -59,7 +63,32 @@ class SequenceSoftmaxKernel : public framework::OpKernel {
 template <typename Place, typename T>
 class SequenceSoftmaxGradKernel : public framework::OpKernel {
 public:
-  void Compute(const framework::ExecutionContext& ctx) const override {}
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out = ctx.Input<LoDTensor>("Out");
+    auto* out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+
+    auto lod = x->lod();
+    const size_t level = lod.size() - 1;
+
+    x_grad->mutable_data<T>(ctx.GetPlace());
+    for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
+      int start_pos = static_cast<int>(lod[level][i]);
+      int end_pos = static_cast<int>(lod[level][i + 1]);
+
+      Tensor out_i = out->Slice<T>(start_pos, end_pos);
+      Tensor out_grad_i = out_grad->Slice<T>(start_pos, end_pos);
+      Tensor x_grad_i = x_grad->Slice<T>(start_pos, end_pos);
+
+      // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
+      framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
+      out_i.Resize(dims_i);
+      out_grad_i.Resize(dims_i);
+      x_grad_i.Resize(dims_i);
+      math::SoftmaxGradFunctor<Place, T>()(ctx, &out_i, &out_grad_i, &x_grad_i);
+    }
+  }
 };

 }  // namespace operators

--- a/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
@@ -5,7 +5,7 @@ from op_test import OpTest

 def stable_softmax(x):
    """Compute the softmax of vector x in a numerically stable way."""
-    shiftx = x - np.max(x)
+    shiftx = x - np.max(x).clip(-64.)
    exps = np.exp(shiftx)
    return exps / np.sum(exps)

@@ -30,6 +30,9 @@ class TestSequenceSoftmaxOp(OpTest):
    def test_check_output(self):
        self.check_output()

+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", max_relative_error=0.01)
+

 if __name__ == "__main__":
    unittest.main()