Add rnn_op (#28197)

* Add rnn_op. test=develop * Fix rnn_op grad maker's drop_empty_grad. test=develop

Add rnn_op (#28197)
* Add rnn_op. test=develop * Fix rnn_op grad maker's drop_empty_grad. test=develop
9a600df3 · Guo Sheng · GitHub · 0f4b6247 · 9a600df3 · 9a600df3
5 changed files
--- a/paddle/fluid/operators/rnn_op.cc
+++ b/paddle/fluid/operators/rnn_op.cc
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <memory>
+#include <string>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+namespace paddle {
+namespace operators {
+// Forward operator definition for `rnn`. It validates the shapes of Input,
+// PreState and the optional SequenceLength, then infers the shapes of the Out
+// and State outputs.
+class RNNOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Shape inference:
+  //   * Input must have rank 3; dimension 1 is treated as the batch size.
+  //   * SequenceLength (if given) must have as many entries as the batch size.
+  //   * Every PreState tensor must have rank 3, share the batch size with
+  //     Input and have the same dims as PreState[0].
+  //   * The number of PreState tensors must be 2 for mode "LSTM", 1 otherwise.
+  //   * Out takes Input's dims with the last dim replaced by hidden_size
+  //     (hidden_size * 2 when is_bidirec is true); State takes PreState's dims.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "RNN");
+    OP_INOUT_CHECK(ctx->HasInputs("PreState"), "Input", "PreState", "RNN");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "RNN");
+    OP_INOUT_CHECK(ctx->HasOutputs("State"), "Output", "State", "RNN");
+    auto in_dims = ctx->GetInputDim("Input");
+    auto pre_state_dims = ctx->GetInputsDim("PreState");
+    PADDLE_ENFORCE_EQ(in_dims.size(), 3,
+                      platform::errors::InvalidArgument(
+                          "The rank of Input in RNN  must be 3. But "
+                          "received Input's rank is %d.",
+                          in_dims.size()));
+    if (ctx->HasInput("SequenceLength")) {
+      auto seq_dims = ctx->GetInputDim("SequenceLength");
+      PADDLE_ENFORCE_EQ(
+          in_dims[1], seq_dims[0],
+          platform::errors::InvalidArgument(
+              "The size of SequenceLength has to equal the batch_size. But "
+              "received batch_size is %d and the size of SequenceLength is %d.",
+              in_dims[1], seq_dims[0]));
+    }
+    PADDLE_ENFORCE_EQ(pre_state_dims[0].size(), 3,
+                      platform::errors::InvalidArgument(
+                          "The rank of PreState in RNN  must be 3. But "
+                          "the received rank is %d.",
+                          pre_state_dims[0].size()));
+    // `i` is declared outside the loop because its final value (the number of
+    // PreState tensors) is checked against the mode below.
+    size_t i = 0;
+    for (; i < pre_state_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(
+          in_dims[1], pre_state_dims[i][1],
+          platform::errors::InvalidArgument(
+              "The second dimension size (representing for batch size) of "
+              "Input and PreState should be equal. But received %d and %d.",
+              in_dims[1], pre_state_dims[i][1]));
+      PADDLE_ENFORCE_EQ(
+          pre_state_dims[0], pre_state_dims[i],
+          platform::errors::InvalidArgument(
+              "The dims of all tensors in PreState should be same. But "
+              "received PreState[0] is %s and PreState[%d] is %s.",
+              pre_state_dims[0], i, pre_state_dims[i]));
+    }
+    auto mode = ctx->Attrs().Get<std::string>("mode");
+    size_t num_state = mode == "LSTM" ? 2 : 1;
+    // Report the expected count for the actual mode (it is 1 for every mode
+    // other than LSTM), not a hard-coded 2.
+    PADDLE_ENFORCE_EQ(
+        i, num_state,
+        platform::errors::InvalidArgument(
+            "The number of tensors in PreState of %s should be %d, "
+            "but received %d.",
+            mode, num_state, i));
+    auto out_dims = in_dims;
+    auto hidden_size = ctx->Attrs().Get<int>("hidden_size");
+    bool is_bidirec = ctx->Attrs().Get<bool>("is_bidirec");
+    out_dims[2] = is_bidirec ? hidden_size * 2 : hidden_size;
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->SetOutputsDim("State", pre_state_dims);
+  }
+ protected:
+  // The kernel data type follows the data type of Input; the device comes from
+  // the execution context.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "Input"),
+        ctx.device_context());
+  }
+};
+// Declares the inputs, outputs and attributes of the `rnn` operator.
+// NOTE: adjacent string literals are concatenated verbatim, so every fragment
+// that is followed by another one must end with its own separator.
+class RNNOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "Input",
+        "(Tensor) RNN input tensor, which support variable-time length input "
+        "sequence. "
+        "The shape of the Tensor MUST be ( seq_len * batch_size * input_size). "
+        "seq_len is the total time step in this mini-batch (CAN be change in "
+        "different batch). "
+        "batch_size is the instance number of this batch. "
+        "input_size is the hidden size of the input. "
+        "input_size and the hidden_size in the next may not be same.");
+    AddInput("PreState",
+             "(Tensor) the initial hidden state of the LSTM "
+             "input. This is a tensor with shape (num_layers x batch_size x "
+             "hidden_size) "
+             "and When is_bidirec is True, the shape will be (num_layers*2 x "
+             "batch_size x hidden_size)")
+        .AsDuplicable();
+    AddInput("WeightList",
+             "(vector<Tensor>), stores weight and bias data when the weight "
+             "use the list format. ")
+        .AsDuplicable();
+    AddInput("SequenceLength",
+             "(Tensor) When the input data is padding, "
+             "set this parameter. This parameter represents "
+             "the variable sequence lengths in a batch. "
+             "The size of the vector has to equal the batch_size.")
+        .AsDispensable();
+    AddOutput("DropoutState",
+              "Store the global drop state when training, needed by cudnn rnn.")
+        .AsDispensable();
+    // Additional intermediate outputs may be needed once a CPU kernel exists.
+    AddOutput("Reserve",
+              "(Tensor) a temporary output Tensor to store the reserve_data "
+              "of cudnn kernel.")
+        .AsIntermediate();
+    AddOutput("Out",
+              "(Tensor) the hidden state of LSTM operator. "
+              "The shape is ( seq_len x batch_size x hidden_size) if "
+              "is_bidirec is False "
+              "and When is_bidirec is True, the shape will be ( seq_len x "
+              "batch_size x hidden_size * 2) ");
+    AddOutput("State",
+              "(Tensor) the hidden state of the last step. "
+              "The shape is ( num_layers x batch_size x hidden_size) if "
+              "is_bidirec is False "
+              "and When is_bidirec is True, the shape will be (num_layers*2 x "
+              "batch_size x hidden_size)")
+        .AsDuplicable();
+    AddAttr<float>(
+        "dropout_prob",
+        "dropout prob of the dropout op. "
+        "the dropout ONLY work between rnn layers, not between time steps. "
+        "There is no dropout work on the Out tensor")
+        .SetDefault(0.0);
+    AddAttr<bool>("is_bidirec", "whether it is bidirectional rnn")
+        .SetDefault(false);
+    AddAttr<int>("input_size", "input size of the Input Tensor").SetDefault(10);
+    AddAttr<int>("hidden_size", "hidden size of rnn").SetDefault(100);
+    AddAttr<int>("num_layers", "the total layer number").SetDefault(1);
+    // "mode" has no default, so it must be supplied by the caller.
+    AddAttr<std::string>(
+        "mode",
+        "(string) rnn types, including: LSTM, GRU, RNN_RELU, RNN_TANH.");
+    AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
+    AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(0);
+    AddComment(R"DOC(
+)DOC");
+  }
+};
+// Backward operator definition for `rnn`. Each gradient output that exists
+// takes the dims of the forward input it belongs to.
+class RNNGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Requires the forward Input, PreState and Out to be present, then copies
+  // the dims of Input / WeightList / PreState onto their gradient variables
+  // whenever those gradient outputs are present.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "RNN");
+    OP_INOUT_CHECK(ctx->HasInputs("PreState"), "Input", "PreState", "RNN");
+    OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "RNN");
+    // OP_INOUT_CHECK(ctx->HasInputs("State"), "Input", "State", "RNN");
+    const auto input_grad = framework::GradVarName("Input");
+    const auto weight_list_grad = framework::GradVarName("WeightList");
+    const auto pre_state_grad = framework::GradVarName("PreState");
+    // Single-tensor gradient.
+    if (ctx->HasOutput(input_grad)) {
+      ctx->SetOutputDim(input_grad, ctx->GetInputDim("Input"));
+    }
+    // List-valued gradients.
+    if (ctx->HasOutputs(weight_list_grad)) {
+      ctx->SetOutputsDim(weight_list_grad, ctx->GetInputsDim("WeightList"));
+    }
+    if (ctx->HasOutputs(pre_state_grad)) {
+      ctx->SetOutputsDim(pre_state_grad, ctx->GetInputsDim("PreState"));
+    }
+  }
+  // The kernel data type follows the data type of the gradient of Out; the
+  // device comes from the execution context.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    const auto out_grad = framework::GradVarName("Out");
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, out_grad),
+        ctx.device_context());
+  }
+};
+// Builds the `rnn_grad` op description from the forward `rnn` op.
+template <typename T>
+class RNNGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+ protected:
+  // Wires the forward inputs/outputs and the output gradients into the grad
+  // op, declares the gradients it produces, and copies all forward attributes.
+  void Apply(GradOpPtr<T> op) const override {
+    // Forward a forward-op input to the grad op under the same name.
+    const auto pass_input = [this, &op](const std::string& name) {
+      op->SetInput(name, this->Input(name));
+    };
+    // Forward a forward-op output to the grad op as an input of the same name.
+    const auto pass_output = [this, &op](const std::string& name) {
+      op->SetInput(name, this->Output(name));
+    };
+    // Feed the gradient of a forward-op output into the grad op.
+    const auto pass_output_grad = [this, &op](const std::string& name) {
+      op->SetInput(framework::GradVarName(name), this->OutputGrad(name));
+    };
+    op->SetType("rnn_grad");
+    pass_input("Input");
+    pass_input("PreState");
+    pass_input("WeightList");
+    // SequenceLength is only forwarded when the forward op has it.
+    if (this->HasInput("SequenceLength")) {
+      pass_input("SequenceLength");
+    }
+    pass_output("DropoutState");
+    pass_output("Reserve");
+    pass_output("Out");
+    pass_output_grad("Out");
+    pass_output_grad("State");
+    // WeightList and PreState gradients are requested with
+    // drop_empty_grad == false; Input's gradient uses the default.
+    op->SetOutput(framework::GradVarName("WeightList"),
+                  this->InputGrad("WeightList", false));
+    op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input"));
+    op->SetOutput(framework::GradVarName("PreState"),
+                  this->InputGrad("PreState", false));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+// Placeholder kernel: any attempt to compute with it raises an Unimplemented
+// error.
+template <typename T>
+class NotImpleKernel : public framework::OpKernel<T> {
+ public:
+  // Always throws; the execution context is not inspected.
+  void Compute(const framework::ExecutionContext& /*ctx*/) const override {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "CPU is not support for this kernel now. "
+        "Will be add in the future"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the forward op with its proto maker and with a grad-op maker for
+// both the OpDesc-based and the imperative OpBase-based instantiations.
+REGISTER_OPERATOR(rnn, ops::RNNOp, ops::RNNOpMaker,
+                  ops::RNNGradOpMaker<paddle::framework::OpDesc>,
+                  ops::RNNGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(rnn_grad, ops::RNNGradOp);
+// The CPU kernels registered here are placeholders (float only) that throw an
+// Unimplemented error when invoked.
+REGISTER_OP_CPU_KERNEL(rnn, ops::NotImpleKernel<float>);
+REGISTER_OP_CPU_KERNEL(rnn_grad, ops::NotImpleKernel<float>);
--- a/paddle/fluid/operators/rnn_op.cu.cc
+++ b/paddle/fluid/operators/rnn_op.cu.cc
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -361,6 +361,12 @@ class ScopedDropoutDescriptor {
                                             float dropout_prob_,
                                             framework::Tensor* dropout_state_,
                                             int seed, size_t state_size) {
+    if (dropout_state_ == nullptr) {  // for no dropout or test
+      PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetDropoutDescriptor(
+          desc_, handle, 0 /* dropout */, nullptr, 0 /* state_size */,
+          0 /* seed */));
+      return desc_;
+    }
    auto* dropout_state_data = dropout_state_->data<uint8_t>();
    if (!initialized) {
      PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetDropoutDescriptor(

--- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
@@ -93,10 +93,14 @@ class TestSimpleRNN(unittest.TestCase):
        np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+    # Run the shared save/load inference check for the "SimpleRNN" layer.
+    def test_predict(self):
+        predict_test_util(self.place, "SimpleRNN")
    def runTest(self):
        self.test_with_initial_state()
        self.test_with_zero_state()
        self.test_with_input_lengths()
+        self.test_predict()
 class TestGRU(unittest.TestCase):
@@ -175,10 +179,14 @@ class TestGRU(unittest.TestCase):
        np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
+    # Run the shared save/load inference check for the "GRU" layer.
+    def test_predict(self):
+        predict_test_util(self.place, "GRU")
    def runTest(self):
        self.test_with_initial_state()
        self.test_with_zero_state()
        self.test_with_input_lengths()
+        self.test_predict()
 class TestLSTM(unittest.TestCase):
@@ -258,61 +266,7 @@ class TestLSTM(unittest.TestCase):
        np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
    def test_predict(self):
-        place = paddle.set_device(self.place)
+        predict_test_util(self.place, "LSTM")
-        paddle.seed(123)
-        np.random.seed(123)
-        class Net(paddle.nn.Layer):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.rnn1 = paddle.nn.LSTM(
-                    16, 32, 2, direction="bidirectional", dropout=0.1)
-            def forward(self, input):
-                return self.rnn1(input)
-        x = paddle.randn((4, 10, 16))
-        x.stop_gradient = False
-        seq_len = paddle.to_tensor(np.array([10, 6, 8, 5]))
-        mask = sequence_mask(seq_len, maxlen=10, dtype=x.dtype)
-        mask = paddle.unsqueeze(mask, [2])
-        rnn = Net()
-        y, (h, c) = rnn(x)
-        y = y * mask
-        loss = paddle.mean(y)
-        loss.backward()
-        optimizer = paddle.optimizer.Adam(
-            learning_rate=0.1, parameters=rnn.parameters())
-        optimizer.step()
-        rnn.eval()
-        y, (h, c) = rnn(x)
-        # `jit.to_static` would include a train_program, eval mode might cause
-        # some errors currently, such as dropout grad op gets `is_test == True`.
-        rnn.train()
-        rnn = paddle.jit.to_static(
-            rnn,
-            [paddle.static.InputSpec(
-                shape=[None, None, 16], dtype=x.dtype)])
-        paddle.jit.save(rnn, "./inference/lstm_infer")
-        paddle.enable_static()
-        new_scope = paddle.static.Scope()
-        with paddle.static.scope_guard(new_scope):
-            exe = paddle.static.Executor(place)
-            [inference_program, feed_target_names,
-             fetch_targets] = paddle.static.load_inference_model(
-                 dirname="./inference",
-                 executor=exe,
-                 model_filename="lstm_infer.pdmodel",
-                 params_filename="lstm_infer.pdiparams")
-            results = exe.run(inference_program,
-                              feed={feed_target_names[0]: x.numpy()},
-                              fetch_list=fetch_targets)
-            np.testing.assert_equal(
-                y.numpy(), results[0])  # eval results equal predict results
-        paddle.disable_static()
    def runTest(self):
        self.test_with_initial_state()
@@ -321,6 +275,66 @@ class TestLSTM(unittest.TestCase):
        self.test_predict()
+def predict_test_util(place, mode):
+    """Check that eval-mode output matches the saved inference model's output.
+
+    Builds a bidirectional `paddle.nn.<mode>` layer, runs one optimizer step,
+    records the eval-mode forward output, exports the layer with
+    `paddle.jit.save`, reloads it as a static inference program and asserts
+    that the reloaded program reproduces the eval-mode output exactly.
+
+    Args:
+        place: device identifier passed to `paddle.set_device`.
+        mode: name of the layer class looked up on `paddle.nn`
+            (the callers in this file pass "SimpleRNN", "GRU" and "LSTM").
+
+    Side effects: writes model files under `./inference/` and toggles
+    static mode on and then off again.
+    """
+    place = paddle.set_device(place)
+    # Fix both RNG seeds so the run is repeatable.
+    paddle.seed(123)
+    np.random.seed(123)
+    class Net(paddle.nn.Layer):
+        def __init__(self):
+            super(Net, self).__init__()
+            # Positional args are presumably (input_size, hidden_size,
+            # num_layers) -- TODO confirm against the layer's signature.
+            self.rnn = getattr(paddle.nn, mode)(16,
+                                                32,
+                                                2,
+                                                direction="bidirectional",
+                                                dropout=0.1)
+        def forward(self, input):
+            return self.rnn(input)
+    x = paddle.randn((4, 10, 16))
+    x.stop_gradient = False
+    seq_len = paddle.to_tensor(np.array([10, 6, 8, 5]))
+    mask = sequence_mask(seq_len, maxlen=10, dtype=x.dtype)
+    mask = paddle.unsqueeze(mask, [2])
+    rnn = Net()
+    # One training step: forward, masked mean loss, backward, Adam update.
+    y, _ = rnn(x)
+    y = y * mask
+    loss = paddle.mean(y)
+    loss.backward()
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=0.1, parameters=rnn.parameters())
+    optimizer.step()
+    # Reference output in eval mode, taken after the parameter update.
+    rnn.eval()
+    y, _ = rnn(x)
+    # `jit.to_static` would include a train_program, eval mode might cause
+    # some errors currently, such as dropout grad op gets `is_test == True`.
+    rnn.train()
+    rnn = paddle.jit.to_static(
+        rnn, [paddle.static.InputSpec(
+            shape=[None, None, 16], dtype=x.dtype)])
+    # The file prefix depends on `mode`, so each layer type gets its own files.
+    paddle.jit.save(rnn, "./inference/%s_infer" % mode)
+    paddle.enable_static()
+    new_scope = paddle.static.Scope()
+    with paddle.static.scope_guard(new_scope):
+        exe = paddle.static.Executor(place)
+        [inference_program, feed_target_names,
+         fetch_targets] = paddle.static.load_inference_model(
+             dirname="./inference",
+             executor=exe,
+             model_filename="%s_infer.pdmodel" % mode,
+             params_filename="%s_infer.pdiparams" % mode)
+        results = exe.run(inference_program,
+                          feed={feed_target_names[0]: x.numpy()},
+                          fetch_list=fetch_targets)
+        np.testing.assert_equal(
+            y.numpy(), results[0])  # eval results equal predict results
+    # Switch back out of static mode before returning.
+    paddle.disable_static()
 def load_tests(loader, tests, pattern):
    suite = unittest.TestSuite()
    devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \

--- a/python/paddle/nn/layer/rnn.py
+++ b/python/paddle/nn/layer/rnn.py
@@ -990,7 +990,6 @@ class RNNBase(LayerList):
        self.could_use_cudnn &= direction != "backward"
        self.could_use_cudnn &= len(self.parameters()) == num_layers * 4 * (
            2 if direction == "bidirectional" else 1)
-        self.could_use_cudnn &= mode == "LSTM"  # currently only support LSTM
        # Expose params as RNN's attribute, which can make it compatible when
        # replacing small ops composed rnn with cpp rnn kernel.
@@ -1062,22 +1061,18 @@ class RNNBase(LayerList):
    def _cudnn_impl(self, inputs, initial_states, sequence_length):
        if not self.time_major:
            inputs = paddle.tensor.transpose(inputs, [1, 0, 2])
-        # unify LSTM/GRU/SimpleRNN later, currently only support LSTM
-        # TODO(guosheng): use `core.ops.cudnn_lstm` in dygraph mode if support
-        # specify output, since `dropout_state` should be a persistable tensor
-        # rather than a temporary on.
        out = self._helper.create_variable_for_type_inference(inputs.dtype)
-        last_h = self._helper.create_variable_for_type_inference(inputs.dtype)
+        state = [
-        last_c = self._helper.create_variable_for_type_inference(inputs.dtype)
+            self._helper.create_variable_for_type_inference(inputs.dtype)
+            for i in range(self.state_components)
+        ]
        reserve = self._helper.create_variable_for_type_inference(
            dtype=fluid.core.VarDesc.VarType.UINT8, stop_gradient=True)
        inputs = {
            'Input': inputs,
-            # 'W': self._flat_weight,  # would be unused_var
            'WeightList': self._all_weights,
-            'InitH': initial_states[0],
+            'PreState': initial_states,
-            'InitC': initial_states[1],
            'SequenceLength': sequence_length
        }
        attrs = {
@@ -1086,23 +1081,22 @@ class RNNBase(LayerList):
            'input_size': self.input_size,
            'hidden_size': self.hidden_size,
            'num_layers': self.num_layers,
+            'mode': self.mode,
            'is_test': not self.training
        }
        outputs = {
            'Out': out,
-            'LastH': last_h,
+            'State': state,
-            'LastC': last_c,
            'Reserve': reserve,
-            'StateOut': self._dropout_state,
+            'DropoutState': self._dropout_state,
        }
        self._helper.append_op(
-            type="cudnn_lstm", inputs=inputs, outputs=outputs, attrs=attrs)
+            type="rnn", inputs=inputs, outputs=outputs, attrs=attrs)
        out = paddle.tensor.transpose(out,
                                      [1, 0, 2]) if not self.time_major else out
-        states = (last_h, last_c)
+        return out, tuple(state) if len(state) > 1 else state[0]
-        return out, states
    def forward(self, inputs, initial_states=None, sequence_length=None):
        batch_index = 1 if self.time_major else 0