Unverified commit 69536892 authored by hong, committed by GitHub

change staticRNN to while (#48213)

* change staticRNN to while

* update code

* fix rnn bug

* update

* fix _find_op_path_ bugs in append_backward.

* polish code

* revert op proto

* update

* update while

* format

* revert test while loop op

* fix create array

* fix windows error

* fix bug

* update

* fix array write bug
Co-authored-by: xiongkun <xiongkun03@baidu.com>
Parent c4f30c51
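The gist of this change: _rnn_static_graph no longer builds a StaticRNN; it drives the cell with an explicit While op, a step counter kept on CPU, and tensor arrays holding the per-step outputs and states (the while_op.cc and append_backward fixes below support this path). A stripped-down sketch of that control flow, with a placeholder tanh cell and illustrative shapes, not the actual patch (which also handles sequence_length masking and wraps the counter ops in device_guard("cpu")):

import paddle
from paddle.fluid.layers import control_flow

paddle.enable_static()

# Illustrative inputs: x is [T, B, C], h0 is [B, C].
x = paddle.static.data("x", [None, 4, 16], dtype="float32")
h0 = paddle.static.data("h0", [4, 16], dtype="float32")

end = paddle.cast(paddle.shape(x)[0], "int64")   # number of time steps
i = paddle.zeros([1], dtype="int64")             # loop counter
cond = i < end
while_op = control_flow.While(cond)

out_array = paddle.tensor.create_array(dtype="float32")     # per-step outputs
state_array = paddle.tensor.create_array(dtype="float32")   # h_0 .. h_T
paddle.tensor.array_write(h0, i, state_array)

with while_op.block():
    step_in = x[i]                                       # x_t
    pre_h = paddle.tensor.array_read(state_array, i)     # h_{t-1}
    new_h = paddle.tanh(step_in + pre_h)                 # placeholder cell
    paddle.tensor.array_write(new_h, i, out_array)
    i = paddle.tensor.increment(x=i, value=1)
    paddle.tensor.array_write(new_h, i, state_array)     # state for the next step
    paddle.fluid.layers.assign(paddle.tensor.less_than(i, end), cond)

out, _ = paddle.fluid.layers.tensor_array_to_tensor(
    out_array, axis=0, use_stack=True
)                                                        # stacked [T, B, C] output
last_h = paddle.tensor.array_read(state_array, end)      # final state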
......@@ -84,3 +84,4 @@ paddle/fluid/pybind/tmp_eager_op_function_impl.h
paddle/fluid/pybind/eager_op_function_impl.h
paddle/fluid/pybind/op_function_impl.h
paddle/fluid/pybind/*final_state_op_function_impl.h
......@@ -305,6 +305,7 @@ class WhileOp : public framework::OperatorBase {
cond_data = GetCondData(
scope.FindVar(Input(kCondition))->Get<phi::DenseTensor>());
}
scope.DeleteScope(&current_scope);
}
}
......@@ -367,6 +368,7 @@ class WhileGradOp : public framework::OperatorBase {
auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program();
auto *parent_block = block->ParentBlock();
auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);
......@@ -428,6 +430,35 @@ class WhileGradOp : public framework::OperatorBase {
continue;
}
if (cur_scope_iter == step_scopes->rbegin()) {
auto &og_outside = *scope.FindVar(outside_og_name);
if (og_outside.IsType<phi::DenseTensor>() &&
!og_outside.GetMutable<phi::DenseTensor>()->IsInitialized()) {
auto *var_desc = parent_block->FindVarRecursive(outside_og_name);
PADDLE_ENFORCE_NOT_NULL(var_desc,
platform::errors::PreconditionNotMet(
"Var `%s` is not found in parent "
"block, can't fill constant.",
outside_og_name));
auto shape = var_desc->GetShape();
VLOG(8) << "Found uninitialized tensor " << outside_og_name
<< " in step 0, fill it with 0.0f. dims="
<< phi::make_ddim(shape);
framework::AttributeMap attrs;
attrs["dtype"] = var_desc->GetDataType();
attrs["shape"] = phi::vectorize<int>(phi::make_ddim(shape));
attrs["value"] = 0.0f;
auto var_name = outside_og_name;
auto zero_op =
framework::OpRegistry::CreateOp("fill_constant",
framework::VariableNameMap{},
{{"Out", {var_name}}},
attrs);
zero_op->Run(scope, dev_place);
}
}
auto &og_outside = *scope.FindVar(outside_og_name);
auto &og_inside = *cur_scope.Var(inside_og_name);
if (og_outside.IsType<phi::DenseTensor>()) {
......@@ -534,9 +565,10 @@ class WhileGradOp : public framework::OperatorBase {
// continue;
// }
auto var_iter = std::find(outside_og_names.begin(),
outside_og_names.end(),
pg_ig_names[param_id]);
auto is_var_input_and_output =
std::find(outside_og_names.begin(),
outside_og_names.end(),
pg_ig_names[param_id]) != outside_og_names.end();
// zero gradient variable in step 0
if (cur_scope_iter == step_scopes->rbegin()) {
......@@ -555,8 +587,7 @@ class WhileGradOp : public framework::OperatorBase {
inside_grad_name,
framework::ToTypeName(var->Type())));
if ((var_iter == outside_og_names.end()) &&
var->IsType<phi::DenseTensor>()) {
if (!is_var_input_and_output && var->IsType<phi::DenseTensor>()) {
auto &inside_tensor = var->Get<phi::DenseTensor>();
framework::AttributeMap attrs;
attrs["dtype"] =
......@@ -575,10 +606,7 @@ class WhileGradOp : public framework::OperatorBase {
inside_tensor.lod());
}
}
auto var_outside = scope.FindVar(pg_ig_names[param_id]);
if ((var_iter == outside_og_names.end()) ||
((var_iter != outside_og_names.end()) &&
var_outside->IsType<framework::LoDTensorArray>())) {
if (!is_var_input_and_output) {
auto new_inside_name = cur_scope.Rename(inside_grad_name);
auto sum_op = framework::OpRegistry::CreateOp(
"sum",
......@@ -587,6 +615,8 @@ class WhileGradOp : public framework::OperatorBase {
framework::AttributeMap{{"use_mkldnn", {false}}});
sum_op->Run(cur_scope, dev_place);
cur_scope.Rename(new_inside_name, inside_grad_name);
} else {
ShareVariable(cur_scope, scope, pg_ig_names[param_id]);
}
}
dev_ctx.Wait();
......@@ -595,6 +625,29 @@ class WhileGradOp : public framework::OperatorBase {
step_scopes->clear();
}
void ShareVariable(const framework::Scope &source,
const framework::Scope &dest,
std::string name) const {
auto from_var = source.FindVar(name);
auto to_var = dest.FindVar(name);
if (from_var->IsType<phi::DenseTensor>()) {
if (from_var->Get<phi::DenseTensor>().IsInitialized()) {
to_var->GetMutable<phi::DenseTensor>()->ShareDataWith(
from_var->Get<phi::DenseTensor>());
}
} else if (from_var->IsType<framework::LoDTensorArray>()) {
auto from_arr = from_var->GetMutable<framework::LoDTensorArray>();
auto to_arr = to_var->GetMutable<framework::LoDTensorArray>();
to_arr->clear();
to_arr->resize(from_arr->size());
for (size_t i = 0; i < to_arr->size(); ++i) {
if (from_arr->at(i).IsInitialized()) {
to_arr->at(i).ShareDataWith(from_arr->at(i));
}
}
}
}
private:
mutable std::shared_ptr<framework::Executor> executor_{nullptr};
mutable std::unique_ptr<framework::ExecutorPrepareContext> ctx_{nullptr};
......@@ -646,6 +699,7 @@ class WhileGradOpMaker : public framework::SingleGradOpMaker<T> {
block_ins.insert(o);
}
std::unordered_set<std::string> output_grads;
for (const auto *op : grad_block->AllOps()) {
for (auto &input_name : op->InputArgumentNames()) {
// If the input of Op has been recorded or is generated by the forward
......@@ -658,7 +712,6 @@ class WhileGradOpMaker : public framework::SingleGradOpMaker<T> {
parent_block->FindVarRecursive(input_name) != nullptr)) {
continue;
}
output_grads.insert(input_name);
}
for (auto &output_name : op->OutputArgumentNames()) {
......
......@@ -2220,6 +2220,10 @@ def _find_op_path_(
op.desc.output_arg_names(), output_names
):
relevant_op_flags[i] = True
if core.has_non_empty_grad_op_maker(op.type):
for name in op.desc.input_arg_names():
if name not in no_grad_set:
output_names.add(name)
op_path = [
block.ops[i] for i in range(len(block.ops)) if relevant_op_flags[i]
......
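The _find_op_path_ fix above makes the relevance search transitive: when an op that has a non-empty grad op maker is kept on the path, its input names are folded into output_names, so the upstream ops that produce those inputs are kept as well. A toy illustration of that reachability idea, in plain Python rather than Paddle's actual op descriptors:

# Walk ops from back to front; keep an op if it produces a needed name,
# and (the fix) mark its inputs as needed so their producers are kept too.
ops = [
    {"type": "matmul", "inputs": ["x", "w"], "outputs": ["h"]},
    {"type": "while", "inputs": ["h", "h0"], "outputs": ["y"]},
    {"type": "mean", "inputs": ["y"], "outputs": ["loss"]},
]
needed = {"loss"}                 # names we need gradients for
relevant = [False] * len(ops)

for idx in reversed(range(len(ops))):
    op = ops[idx]
    if any(name in needed for name in op["outputs"]):
        relevant[idx] = True
        needed.update(op["inputs"])   # the real code also skips no_grad_set

op_path = [op["type"] for idx, op in enumerate(ops) if relevant[idx]]
print(op_path)   # ['matmul', 'while', 'mean']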
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
paddle.set_default_dtype("float64")
import unittest
import numpy as np
from paddle import fluid
from paddle.fluid import framework
bidirectional_list = ["bidirectional", "bidirect"]
class TestSimpleRNN(unittest.TestCase):
def __init__(self, time_major=True, direction="forward", place="cpu"):
super().__init__("runTest")
self.time_major = time_major
self.direction = direction
self.num_directions = 2 if direction in bidirectional_list else 1
self.place = place
self.batch_size = 4
self.input_size = 16
self.hidden_size = 16
self.seq_len = 12
self.seed = 1234
def setUp(self):
# Since `set_device` is global, set `set_device` in `setUp` rather than
# `__init__` to avoid using an error device set by another test case.
place = paddle.set_device(self.place)
paddle.disable_static(self.place)
paddle.seed(self.seed)
paddle.framework.random._manual_program_seed(self.seed)
cell_dy = paddle.nn.SimpleRNNCell(self.input_size, self.hidden_size)
self.rnn_net = paddle.nn.RNN(cell_dy, time_major=self.time_major)
paddle.enable_static()
with paddle.fluid.unique_name.guard():
main_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(
main_program=main_program, startup_program=startup_program
):
paddle.seed(self.seed)
paddle.framework.random._manual_program_seed(self.seed)
self.exe = fluid.Executor(
fluid.CPUPlace()
if self.place == "cpu"
else fluid.CUDAPlace(0)
)
rnn_in_data = paddle.static.data(
"x",
[None, self.batch_size, self.hidden_size],
dtype="float64",
)
pre_h_data = paddle.static.data(
"pre_h",
[self.batch_size, self.hidden_size],
dtype="float64",
)
seq_len_data = paddle.static.data(
"seq_len", [self.batch_size], dtype="int64"
)
cell_st = paddle.nn.SimpleRNNCell(
self.input_size, self.hidden_size
)
self.rnn_st = paddle.nn.RNN(cell_st, time_major=self.time_major)
st_out, st_last_h = self.rnn_st(
rnn_in_data, pre_h_data, sequence_length=seq_len_data
)
self.fetch_list = [st_out, st_last_h]
self.exe.run(framework.default_startup_program())
self.main_program = framework.default_main_program()
paddle.disable_static(self.place)
def test_base(self, test_seq_len=False):
x = np.random.randn(12, 4, 16)
if not self.time_major:
x = np.transpose(x, [1, 0, 2])
prev_h = np.random.randn(4, 16)
paddle.disable_static(self.place)
if test_seq_len:
seq_len = np.array([9, 10, 8, 12], "int64")
else:
seq_len = np.array([12, 12, 12, 12], "int64")
y1, h1 = self.rnn_net(
paddle.to_tensor(x),
paddle.to_tensor(prev_h),
sequence_length=paddle.to_tensor(seq_len),
)
paddle.enable_static()
out = self.exe.run(
self.main_program,
feed={"x": x, "pre_h": prev_h, "seq_len": seq_len},
fetch_list=[self.fetch_list],
)
y2, h2 = out
np.testing.assert_allclose(y1.numpy(), y2, atol=1e-8, rtol=1e-5)
np.testing.assert_allclose(h1.numpy(), h2, atol=1e-8, rtol=1e-5)
def runTest(self):
self.test_base()
self.test_base(True)
class TestGRU(unittest.TestCase):
def __init__(self, time_major=True, direction="forward", place="cpu"):
super().__init__("runTest")
self.time_major = time_major
self.direction = direction
self.num_directions = 2 if direction in bidirectional_list else 1
self.place = place
self.batch_size = 4
self.input_size = 16
self.hidden_size = 16
self.seq_len = 12
self.seed = 1234
def setUp(self):
# Since `set_device` is global, set `set_device` in `setUp` rather than
# `__init__` to avoid using an error device set by another test case.
place = paddle.set_device(self.place)
paddle.disable_static(self.place)
paddle.seed(self.seed)
paddle.framework.random._manual_program_seed(self.seed)
cell_dy = paddle.nn.GRUCell(self.input_size, self.hidden_size)
self.rnn_net = paddle.nn.RNN(cell_dy, time_major=self.time_major)
paddle.enable_static()
with paddle.fluid.unique_name.guard():
main_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(
main_program=main_program, startup_program=startup_program
):
paddle.seed(self.seed)
paddle.framework.random._manual_program_seed(self.seed)
self.exe = fluid.Executor(
fluid.CPUPlace()
if self.place == "cpu"
else fluid.CUDAPlace(0)
)
rnn_in_data = paddle.static.data(
"x",
[None, self.batch_size, self.hidden_size],
dtype="float64",
)
pre_h_data = paddle.static.data(
"pre_h",
[self.batch_size, self.hidden_size],
dtype="float64",
)
seq_len_data = paddle.static.data(
"seq_len", [self.batch_size], dtype="int64"
)
cell_st = paddle.nn.GRUCell(self.input_size, self.hidden_size)
self.rnn_st = paddle.nn.RNN(cell_st, time_major=self.time_major)
st_out, st_last_h = self.rnn_st(
rnn_in_data, pre_h_data, sequence_length=seq_len_data
)
self.fetch_list = [st_out, st_last_h]
self.exe.run(framework.default_startup_program())
self.main_program = framework.default_main_program()
paddle.disable_static(self.place)
def test_base(self, test_seq_len=False):
x = np.random.randn(12, 4, 16)
if not self.time_major:
x = np.transpose(x, [1, 0, 2])
prev_h = np.random.randn(4, 16)
paddle.disable_static(self.place)
if test_seq_len:
seq_len = np.array([9, 10, 8, 12], "int64")
else:
seq_len = np.array([12, 12, 12, 12], "int64")
y1, h1 = self.rnn_net(
paddle.to_tensor(x),
paddle.to_tensor(prev_h),
sequence_length=paddle.to_tensor(seq_len),
)
paddle.enable_static()
out = self.exe.run(
self.main_program,
feed={"x": x, "pre_h": prev_h, "seq_len": seq_len},
fetch_list=[self.fetch_list],
)
y2, h2 = out
np.testing.assert_allclose(y1.numpy(), y2, atol=1e-8, rtol=1e-5)
np.testing.assert_allclose(h1.numpy(), h2, atol=1e-8, rtol=1e-5)
def runTest(self):
self.test_base()
self.test_base(True)
class TestGRUBackward(unittest.TestCase):
def __init__(self, time_major=True, direction="forward", place="cpu"):
super().__init__("runTest")
self.time_major = time_major
self.direction = direction
self.num_directions = 2 if direction in bidirectional_list else 1
self.place = place
self.batch_size = 4
self.input_size = 4
self.hidden_size = 4
self.seq_len = 12
self.seed = 1234
def setUp(self):
# Since `set_device` is global, set `set_device` in `setUp` rather than
# `__init__` to avoid using an error device set by another test case.
place = paddle.set_device(self.place)
paddle.disable_static(self.place)
paddle.seed(self.seed)
paddle.framework.random._manual_program_seed(self.seed)
cell_dy = paddle.nn.SimpleRNNCell(self.input_size, self.hidden_size)
self.rnn_net = paddle.nn.RNN(cell_dy, time_major=self.time_major)
paddle.enable_static()
with paddle.fluid.unique_name.guard():
main_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(
main_program=main_program, startup_program=startup_program
):
paddle.seed(self.seed)
paddle.framework.random._manual_program_seed(self.seed)
self.exe = paddle.fluid.Executor(
fluid.CPUPlace()
if self.place == "cpu"
else fluid.CUDAPlace(0)
)
rnn_in_data = paddle.static.data(
"x",
[None, self.batch_size, self.hidden_size],
dtype="float64",
)
pre_h_data = paddle.static.data(
"pre_h",
[self.batch_size, self.hidden_size],
dtype="float64",
)
seq_len_data = paddle.static.data(
"seq_len", [self.batch_size], dtype="int64"
)
pre_h_data.stop_gradient = False
rnn_in_data.stop_gradient = False
cell_st = paddle.nn.SimpleRNNCell(
self.input_size, self.hidden_size
)
self.rnn_st = paddle.nn.RNN(cell_st, time_major=self.time_major)
st_out, st_last_h = self.rnn_st(
rnn_in_data, pre_h_data, sequence_length=seq_len_data
)
loss = paddle.sum(st_out)
sgd = paddle.optimizer.SGD(0.0)
sgd.minimize(loss)
self.fetch_list = [st_out, st_last_h, "pre_h@GRAD", "x@GRAD"]
self.exe.run(framework.default_startup_program())
self.main_program = framework.default_main_program()
paddle.disable_static(self.place)
def test_base(self, test_seq_len=False):
x = np.random.randn(12, 4, self.hidden_size)
if not self.time_major:
x = np.transpose(x, [1, 0, 2])
prev_h = np.random.randn(4, self.hidden_size)
paddle.disable_static(self.place)
if test_seq_len:
seq_len = np.array([9, 10, 8, 12], "int64")
else:
seq_len = np.array([12, 12, 12, 12], "int64")
x_in = paddle.to_tensor(x)
h_in = paddle.to_tensor(prev_h)
x_in.stop_gradient = False
h_in.stop_gradient = False
y1, h1 = self.rnn_net(
x_in,
h_in,
sequence_length=paddle.to_tensor(seq_len),
)
loss = y1.sum()
loss.backward()
h1_grad = h_in.gradient()
paddle.enable_static()
out = self.exe.run(
self.main_program,
feed={"x": x, "pre_h": prev_h, "seq_len": seq_len},
fetch_list=[self.fetch_list],
)
y2, h2, g1, g2 = out
np.testing.assert_allclose(h1_grad, g1, atol=1e-8, rtol=1e-5)
def runTest(self):
self.test_base(True)
self.test_base()
self.test_base()
self.test_base(True)
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
......@@ -248,8 +248,8 @@ def _rnn_static_graph(
if not time_major:
inputs = map_structure(_transpose_batch_time, inputs)
max_seq_len = paddle.shape(flatten(inputs)[0])[0]
if sequence_length:
max_seq_len = paddle.shape(flatten(inputs)[0])[0]
mask = sequence_lod.sequence_mask(
sequence_length,
maxlen=max_seq_len,
......@@ -260,30 +260,77 @@ def _rnn_static_graph(
inputs = map_structure(lambda x: paddle.reverse(x, axis=[0]), inputs)
mask = paddle.reverse(mask, axis=[0]) if sequence_length else None
# StaticRNN
rnn = control_flow.StaticRNN()
with rnn.step():
inputs = map_structure(rnn.step_input, inputs)
states = map_structure(rnn.memory, initial_states)
copy_states = map_structure(lambda x: x, states)
outputs, new_states = cell(inputs, copy_states, **kwargs)
utils.assert_same_structure(states, new_states)
with paddle.fluid.framework.device_guard("cpu"):
start_i = paddle.zeros([1], dtype="int64")
end = max_seq_len
end = paddle.cast(end, "int64")
cond = start_i < end
while_op = control_flow.While(cond)
out_array = paddle.tensor.create_array(dtype=flatten(inputs)[0].dtype)
init_array = map_structure(
lambda x: paddle.tensor.create_array(dtype=x.dtype), initial_states
)
map_structure(
lambda x, y: paddle.tensor.array_write(x, start_i, y),
initial_states,
init_array,
)
with while_op.block():
step_in = inputs[start_i]
# step_in = paddle.fluid.layers.Print( step_in, message="step in")
pre_state = map_structure(
lambda x: paddle.tensor.array_read(x, start_i), init_array
)
# pre_state = paddle.fluid.layers.Print( pre_state, message="pre")
outputs, new_states = cell(step_in, pre_state, **kwargs)
assert isinstance(outputs, paddle.fluid.framework.Variable)
utils.assert_same_structure(new_states, pre_state)
if sequence_length:
step_mask = rnn.step_input(mask)
step_mask = paddle.unsqueeze(mask[start_i], 1)
# paddle.fluid.layers.Print( step_mask, message="mask")
# new_states = map_structure(
# partial(_maybe_copy, step_mask=step_mask),
# pre_state, new_states
# )
new_states = map_structure(
partial(_maybe_copy, step_mask=step_mask), states, new_states
lambda x, y: (x * step_mask + y * (1.0 - step_mask)),
new_states,
pre_state,
)
map_structure(rnn.update_memory, states, new_states)
flat_outputs = flatten(outputs)
map_structure(rnn.step_output, outputs)
map_structure(rnn.step_output, new_states)
paddle.tensor.array_write(outputs, start_i, out_array)
with paddle.fluid.framework.device_guard("cpu"):
start_i = paddle.tensor.increment(x=start_i, value=1)
map_structure(
lambda x, y: paddle.tensor.array_write(x, start_i, y),
new_states,
init_array,
)
with paddle.fluid.framework.device_guard("cpu"):
new_cond = paddle.tensor.less_than(start_i, end)
paddle.fluid.layers.assign(new_cond, cond)
out, _ = paddle.fluid.layers.tensor_array_to_tensor(
out_array, axis=0, use_stack=True
)
rnn_out = rnn()
final_outputs = rnn_out[: len(flat_outputs)]
final_outputs = utils.pack_sequence_as(outputs, final_outputs)
final_states = map_structure(lambda x: x[-1], rnn_out[len(flat_outputs) :])
final_states = utils.pack_sequence_as(new_states, final_states)
all_state = map_structure(
lambda x: paddle.fluid.layers.tensor_array_to_tensor(
x, axis=0, use_stack=True
)[0],
init_array,
)
final_outputs = out
final_states = map_structure(lambda x: x[-1], all_state)
if is_reverse:
final_outputs = map_structure(
......
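One detail worth calling out from the block above: with sequence_length set, padded time steps are handled by the elementwise rule new_state * mask + old_state * (1 - mask) instead of StaticRNN's _maybe_copy. A tiny NumPy check of that rule (illustrative values only):

import numpy as np

step_mask = np.array([[1.0], [0.0]])   # batch of 2; only the first sample is still valid
new_h = np.array([[0.7], [0.9]])       # state proposed by the cell at this step
pre_h = np.array([[0.2], [0.4]])       # state carried over from the previous step

kept = new_h * step_mask + pre_h * (1.0 - step_mask)
print(kept)   # [[0.7], [0.4]] -> the padded sample keeps its old state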