rnn.py 59.0 KB
Newer Older
G
Guo Sheng 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
import sys
G
Guo Sheng 已提交
16
from functools import partial, reduce
J
Jiaqi Liu 已提交
17
import warnings
G
Guo Sheng 已提交
18

19

20
import paddle
21
from paddle.utils import deprecated
G
Guo Sheng 已提交
22 23 24 25
from . import nn
from . import tensor
from . import control_flow
from . import utils
26
from . import sequence_lod
G
Guo Sheng 已提交
27
from .utils import *
weixin_46829950's avatar
weixin_46829950 已提交
28
from .. import core
29 30
from ..framework import default_main_program
from ..data_feeder import convert_dtype
31
from ..layer_helper import LayerHelper
J
Jiabin Yang 已提交
32
from ..framework import _non_static_mode
33
from ..param_attr import ParamAttr
X
Xing Wu 已提交
34
from ..data_feeder import check_variable_and_dtype, check_type, check_dtype
35

36
from collections.abc import Sequence
G
Guo Sheng 已提交
37 38 39

__all__ = [
    'dynamic_decode',
40 41 42 43 44
    'dynamic_lstm',
    'dynamic_lstmp',
    'dynamic_gru',
    'gru_unit',
    'lstm',
G
Guo Sheng 已提交
45 46 47
]


48
class ArrayWrapper:
F
Feiyu Chan 已提交
49 50 51 52 53 54 55
    def __init__(self, x):
        self.array = [x]

    def append(self, x):
        self.array.append(x)
        return self

56 57 58
    def __getitem__(self, item):
        return self.array.__getitem__(item)

F
Feiyu Chan 已提交
59

60 61 62 63 64 65 66 67 68 69
def _dynamic_decode_imperative(
    decoder,
    inits=None,
    max_step_num=None,
    output_time_major=False,
    impute_finished=False,
    is_test=False,
    return_length=False,
    **kwargs
):
70 71 72 73 74 75 76 77 78 79 80
    def _maybe_copy(state, new_state, step_mask):
        # TODO: use where_op
        state_dtype = state.dtype
        if convert_dtype(state_dtype) in ["bool"]:
            state = tensor.cast(state, dtype="float32")
            new_state = tensor.cast(new_state, dtype="float32")
        if step_mask.dtype != state.dtype:
            step_mask = tensor.cast(step_mask, dtype=state.dtype)
            # otherwise, renamed bool gradients of would be summed up leading
            # to sum(bool) error.
            step_mask.stop_gradient = True
81
        new_state = paddle.tensor.math._multiply_with_axis(
82
            state, step_mask, axis=0
83 84 85
        ) - paddle.tensor.math._multiply_with_axis(
            new_state, (step_mask - 1), axis=0
        )
86 87 88
        if convert_dtype(state_dtype) in ["bool"]:
            new_state = tensor.cast(new_state, dtype=state_dtype)
        return new_state
S
swtkiwi 已提交
89

90
    initial_inputs, initial_states, initial_finished = decoder.initialize(inits)
91 92 93 94 95
    inputs, states, finished = (
        initial_inputs,
        initial_states,
        initial_finished,
    )
96
    cond = paddle.logical_not((paddle.all(initial_finished)))
97
    sequence_lengths = tensor.cast(paddle.zeros_like(initial_finished), "int64")
98 99 100
    outputs = None

    step_idx = 0
101 102 103
    step_idx_tensor = tensor.fill_constant(
        shape=[1], dtype="int64", value=step_idx
    )
104
    while cond.numpy():
105 106 107
        (step_outputs, next_states, next_inputs, next_finished) = decoder.step(
            step_idx_tensor, inputs, states, **kwargs
        )
108 109 110 111 112
        if not decoder.tracks_own_finished:
            # BeamSearchDecoder would track it own finished, since
            # beams would be reordered and the finished status of each
            # entry might change. Otherwise, perform logical OR which
            # would not change the already finished.
2
201716010711 已提交
113
            next_finished = paddle.logical_or(next_finished, finished)
114 115 116
            # To confirm states.finished/finished be consistent with
            # next_finished.
            tensor.assign(next_finished, finished)
117
            next_sequence_lengths = paddle.add(
J
Jiaqi Liu 已提交
118
                sequence_lengths,
119
                tensor.cast(
2
201716010711 已提交
120
                    paddle.logical_not(finished), sequence_lengths.dtype
121 122
                ),
            )
J
Jiaqi Liu 已提交
123 124
            if impute_finished:  # rectify the states for the finished.
                next_states = map_structure(
125 126 127 128
                    lambda x, y: _maybe_copy(x, y, finished),
                    states,
                    next_states,
                )
J
Jiaqi Liu 已提交
129 130 131 132
        else:
            warnings.warn(
                "`next_states` has no `lengths` attribute, the returned `sequence_lengths` would be all zeros."
            ) if not hasattr(next_states, "lengths") else None
133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
            next_sequence_lengths = getattr(
                next_states, "lengths", sequence_lengths
            )

        outputs = (
            map_structure(lambda x: ArrayWrapper(x), step_outputs)
            if step_idx == 0
            else map_structure(
                lambda x, x_array: x_array.append(x), step_outputs, outputs
            )
        )
        inputs, states, finished, sequence_lengths = (
            next_inputs,
            next_states,
            next_finished,
            next_sequence_lengths,
        )
G
Guo Sheng 已提交
150

151
        paddle.increment(x=step_idx_tensor, value=1.0)
152
        step_idx += 1
G
Guo Sheng 已提交
153

154
        cond = paddle.logical_not(paddle.all(finished))
155 156
        if max_step_num is not None and step_idx > max_step_num:
            break
G
Guo Sheng 已提交
157

158 159 160
    final_outputs = map_structure(
        lambda x: paddle.stack(x.array, axis=0), outputs
    )
161
    final_states = states
G
Guo Sheng 已提交
162

163
    try:
164 165 166
        final_outputs, final_states = decoder.finalize(
            final_outputs, final_states, sequence_lengths
        )
167 168
    except NotImplementedError:
        pass
G
Guo Sheng 已提交
169

170 171
    if not output_time_major:
        final_outputs = map_structure(
172 173 174
            lambda x: paddle.transpose(
                x, [1, 0] + list(range(2, len(x.shape)))
            ),
175 176
            final_outputs,
        )
177

178 179 180 181 182
    return (
        (final_outputs, final_states, sequence_lengths)
        if return_length
        else (final_outputs, final_states)
    )
183 184


185 186 187 188 189 190 191 192 193 194
def _dynamic_decode_declarative(
    decoder,
    inits=None,
    max_step_num=None,
    output_time_major=False,
    impute_finished=False,
    is_test=False,
    return_length=False,
    **kwargs
):
G
Guo Sheng 已提交
195
    initial_inputs, initial_states, initial_finished = decoder.initialize(inits)
196 197 198 199 200
    global_inputs, global_states, global_finished = (
        initial_inputs,
        initial_states,
        initial_finished,
    )
201
    global_finished.stop_gradient = True
G
Guo Sheng 已提交
202
    step_idx = tensor.fill_constant(shape=[1], dtype="int64", value=0)
203

204
    cond = paddle.logical_not((paddle.all(initial_finished)))
G
Guo Sheng 已提交
205
    if max_step_num is not None:
206 207 208
        max_step_num = tensor.fill_constant(
            shape=[1], dtype="int64", value=max_step_num
        )
209
    while_op = paddle.static.nn.control_flow.While(cond, is_test=is_test)
G
Guo Sheng 已提交
210

211
    sequence_lengths = tensor.cast(paddle.zeros_like(initial_finished), "int64")
212 213 214 215 216 217 218 219 220
    sequence_lengths.stop_gradient = True

    if is_test:
        # for test, reuse inputs and states variables to save memory
        inputs = map_structure(lambda x: x, initial_inputs)
        states = map_structure(lambda x: x, initial_states)
    else:
        # inputs and states of all steps must be saved for backward and training
        inputs_arrays = map_structure(
221 222
            lambda x: control_flow.array_write(x, step_idx), initial_inputs
        )
223
        states_arrays = map_structure(
224 225
            lambda x: control_flow.array_write(x, step_idx), initial_states
        )
G
Guo Sheng 已提交
226 227 228

    def _maybe_copy(state, new_state, step_mask):
        # TODO: use where_op
229 230 231 232 233 234 235 236 237
        state_dtype = state.dtype
        if convert_dtype(state_dtype) in ["bool"]:
            state = tensor.cast(state, dtype="float32")
            new_state = tensor.cast(new_state, dtype="float32")
        if step_mask.dtype != state.dtype:
            step_mask = tensor.cast(step_mask, dtype=state.dtype)
            # otherwise, renamed bool gradients of would be summed up leading
            # to sum(bool) error.
            step_mask.stop_gradient = True
238
        new_state = paddle.tensor.math._multiply_with_axis(
239
            state, step_mask, axis=0
240 241 242
        ) - paddle.tensor.math._multiply_with_axis(
            new_state, (step_mask - 1), axis=0
        )
243 244
        if convert_dtype(state_dtype) in ["bool"]:
            new_state = tensor.cast(new_state, dtype=state_dtype)
G
Guo Sheng 已提交
245 246 247
        return new_state

    def _transpose_batch_time(x):
248
        return paddle.transpose(x, [1, 0] + list(range(2, len(x.shape))))
G
Guo Sheng 已提交
249

250 251
    def _create_array_out_of_while(dtype):
        current_block_idx = default_main_program().current_block_idx
252 253 254
        default_main_program().current_block_idx = (
            default_main_program().current_block().parent_idx
        )
255
        tensor_array = paddle.tensor.create_array(dtype)
256 257 258
        default_main_program().current_block_idx = current_block_idx
        return tensor_array

G
Guo Sheng 已提交
259 260
    # While
    with while_op.block():
261 262 263
        if not is_test:
            inputs = map_structure(
                lambda array: control_flow.array_read(array, step_idx),
264 265
                inputs_arrays,
            )
266 267
            states = map_structure(
                lambda array: control_flow.array_read(array, step_idx),
268 269 270 271 272
                states_arrays,
            )
        (outputs, next_states, next_inputs, next_finished) = decoder.step(
            step_idx, inputs, states, **kwargs
        )
273 274 275 276 277
        if not decoder.tracks_own_finished:
            # BeamSearchDecoder would track it own finished, since beams would
            # be reordered and the finished status of each entry might change.
            # Otherwise, perform logical OR which would not change the already
            # finished.
2
201716010711 已提交
278
            next_finished = paddle.logical_or(next_finished, global_finished)
279
            next_sequence_lengths = paddle.add(
J
Jiaqi Liu 已提交
280
                sequence_lengths,
281
                tensor.cast(
2
201716010711 已提交
282
                    paddle.logical_not(global_finished),
283 284 285
                    sequence_lengths.dtype,
                ),
            )
J
Jiaqi Liu 已提交
286 287 288 289
            if impute_finished:  # rectify the states for the finished.
                next_states = map_structure(
                    lambda x, y: _maybe_copy(x, y, global_finished),
                    states,
290 291
                    next_states,
                )
J
Jiaqi Liu 已提交
292 293 294 295
        else:
            warnings.warn(
                "`next_states` has no `lengths` attribute, the returned `sequence_lengths` would be all zeros."
            ) if not hasattr(next_states, "lengths") else None
296 297 298
            next_sequence_lengths = getattr(
                next_states, "lengths", sequence_lengths
            )
299 300 301

        # create tensor array in global block after dtype[s] of outputs can be got
        outputs_arrays = map_structure(
302 303
            lambda x: _create_array_out_of_while(x.dtype), outputs
        )
304

G
Guo Sheng 已提交
305 306
        map_structure(
            lambda x, x_array: control_flow.array_write(
307 308 309 310 311
                x, i=step_idx, array=x_array
            ),
            outputs,
            outputs_arrays,
        )
312 313

        paddle.increment(x=step_idx, value=1.0)
314 315 316 317
        # update the global_finished first, since it might be also in states of
        # decoder, which otherwise would write a stale finished status to array
        tensor.assign(next_finished, global_finished)
        tensor.assign(next_sequence_lengths, sequence_lengths)
318 319 320 321 322 323
        if is_test:
            map_structure(tensor.assign, next_inputs, global_inputs)
            map_structure(tensor.assign, next_states, global_states)
        else:
            map_structure(
                lambda x, x_array: control_flow.array_write(
324 325 326 327 328
                    x, i=step_idx, array=x_array
                ),
                next_inputs,
                inputs_arrays,
            )
329 330
            map_structure(
                lambda x, x_array: control_flow.array_write(
331 332 333 334 335
                    x, i=step_idx, array=x_array
                ),
                next_states,
                states_arrays,
            )
G
Guo Sheng 已提交
336
        if max_step_num is not None:
337
            paddle.logical_and(
338
                paddle.logical_not(paddle.all(global_finished)),
339
                paddle.less_equal(step_idx, max_step_num),
340 341
                cond,
            )
G
Guo Sheng 已提交
342
        else:
343
            paddle.logical_not(paddle.all(global_finished), cond)
G
Guo Sheng 已提交
344 345 346

    final_outputs = map_structure(
        lambda array: tensor.tensor_array_to_tensor(
347 348 349 350
            array, axis=0, use_stack=True
        )[0],
        outputs_arrays,
    )
351 352 353 354 355
    if is_test:
        final_states = global_states
    else:
        final_states = map_structure(
            lambda array: control_flow.array_read(array, step_idx),
356 357
            states_arrays,
        )
G
Guo Sheng 已提交
358 359

    try:
360 361 362
        final_outputs, final_states = decoder.finalize(
            final_outputs, final_states, sequence_lengths
        )
G
Guo Sheng 已提交
363 364 365 366 367 368
    except NotImplementedError:
        pass

    if not output_time_major:
        final_outputs = map_structure(_transpose_batch_time, final_outputs)

369 370 371 372 373
    return (
        (final_outputs, final_states, sequence_lengths)
        if return_length
        else (final_outputs, final_states)
    )
374 375


376 377 378 379 380 381 382 383 384 385
def dynamic_decode(
    decoder,
    inits=None,
    max_step_num=None,
    output_time_major=False,
    impute_finished=False,
    is_test=False,
    return_length=False,
    **kwargs
):
386
    r"""
387 388 389 390 391 392 393 394 395 396
    Dynamic decoding performs :code:`decoder.step()` repeatedly until the returned
    Tensor indicating finished status contains all True values or the number of
    decoding step reaches to :attr:`max_step_num`.

    :code:`decoder.initialize()` would be called once before the decoding loop.
    If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()`
    would be called once after the decoding loop.

    Parameters:
        decoder(Decoder): An instance of `Decoder`.
397
        inits(object, optional): Argument passed to `decoder.initialize`.
398 399 400 401 402 403 404 405 406 407
            Default `None`.
        max_step_num(int, optional): The maximum number of steps. If not provided,
            decode until the decoder is fully done, or in other words, the returned
            Tensor by :code:`decoder.step()` indicating finished status contains
            all True. Default `None`.
        output_time_major(bool, optional): Indicate the data layout of Tensor included
            in the final outputs(the first returned value of this method). If
            attr:`False`, the data layout would be batch major with shape
            `[batch_size, seq_len, ...]`.  If attr:`True`, the data layout would
            be time major with shape `[seq_len, batch_size, ...]`. Default: `False`.
J
Jiaqi Liu 已提交
408 409 410 411 412 413 414
        impute_finished(bool, optional): If `True` and `decoder.tracks_own_finished`
            is False, then states get copied through for batch entries which are
            marked as finished, which differs with the unfinished using the new states
            returned by :code:`decoder.step()` and ensures that the final states have
            the correct values. Otherwise, states wouldn't be copied through when
            finished. If the returned `final_states` is needed, it should be set as
            True, which causes some slowdown. Default `False`.
415 416 417 418 419
        is_test(bool, optional): A flag indicating whether to use test mode. In
            test mode, it is more memory saving. Default `False`.
        return_length(bool, optional):  A flag indicating whether to return an
            extra Tensor variable in the output tuple, which stores the actual
            lengths of all decoded sequences. Default `False`.
420
        **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`.
421 422

    Returns:
423

Z
Zman 已提交
424 425 426 427 428 429 430 431 432 433 434
        - final_outputs (Tensor, nested structure of Tensor), each Tensor in :code:`final_outputs` is the stacked of all decoding steps' outputs, which might be revised
            by :code:`decoder.finalize()` if the decoder has implemented finalize.
            And :code:`final_outputs` has the same structure and data types as the :code:`outputs`
            returned by :code:`decoder.step()`

        - final_states (Tensor, nested structure of Tensor), :code:`final_states` is the counterpart at last time step of initial states \
            returned by :code:`decoder.initialize()` , thus has the same structure
            with it and has tensors with same shapes and data types.

        - sequence_lengths (Tensor), stores the actual lengths of all decoded sequences.
            sequence_lengths is provided only if :code:`return_length` is True.
435 436 437 438

    Examples:

        .. code-block:: python
439

440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456
            import paddle
            from paddle.nn import BeamSearchDecoder, dynamic_decode
            from paddle.nn import GRUCell, Linear, Embedding
            trg_embeder = Embedding(100, 32)
            output_layer = Linear(32, 32)
            decoder_cell = GRUCell(input_size=32, hidden_size=32)
            decoder = BeamSearchDecoder(decoder_cell,
                                        start_token=0,
                                        end_token=1,
                                        beam_size=4,
                                        embedding_fn=trg_embeder,
                                        output_fn=output_layer)
            encoder_output = paddle.ones((4, 8, 32), dtype=paddle.get_default_dtype())
            outputs = dynamic_decode(decoder=decoder,
                                    inits=decoder_cell.get_initial_states(encoder_output),
                                    max_step_num=10)
    """
J
Jiabin Yang 已提交
457
    if _non_static_mode():
458 459 460 461 462 463 464 465 466 467
        return _dynamic_decode_imperative(
            decoder,
            inits,
            max_step_num,
            output_time_major,
            impute_finished,
            is_test,
            return_length,
            **kwargs
        )
468
    else:
469 470 471 472 473 474 475 476 477 478
        return _dynamic_decode_declarative(
            decoder,
            inits,
            max_step_num,
            output_time_major,
            impute_finished,
            is_test,
            return_length,
            **kwargs
        )
479 480


481 482 483 484 485 486 487 488 489 490 491 492 493 494 495
def dynamic_lstm(
    input,
    size,
    h_0=None,
    c_0=None,
    param_attr=None,
    bias_attr=None,
    use_peepholes=True,
    is_reverse=False,
    gate_activation='sigmoid',
    cell_activation='tanh',
    candidate_activation='tanh',
    dtype='float32',
    name=None,
):
496
    r"""
S
swtkiwi 已提交
497

498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557
    **Note**:
        1. This OP only supports LoDTensor as inputs. If you need to deal with Tensor, please use :ref:`api_fluid_layers_lstm` .
        2. In order to improve efficiency, users must first map the input of dimension [T, hidden_size] to input of [T, 4 * hidden_size], and then pass it to this OP.

    The implementation of this OP include diagonal/peephole connections.
    Please refer to `Gers, F. A., & Schmidhuber, J. (2000) <ftp://ftp.idsia.ch/pub/juergen/TimeCount-IJCNN2000.pdf>`_ .
    If you do not need peephole connections, please set use_peepholes to False .

    This OP computes each timestep as follows:

    .. math::
      i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_{x_i} + b_{h_i})
    .. math::
      f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_{x_f} + b_{h_f})
    .. math::
      o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_{x_o} + b_{h_o})
    .. math::
      \widetilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + b{x_c} + b_{h_c})
    .. math::
      c_t = f_t \odot c_{t-1} + i_t \odot \widetilde{c_t}
    .. math::
      h_t = o_t \odot tanh(c_t)

    The symbolic meanings in the formula are as follows:

    - :math:`x_{t}` represents the input at timestep :math:`t`
    - :math:`h_{t}` represents the hidden state at timestep :math:`t`
    - :math:`h_{t-1}, c_{t-1}` represent the hidden state and cell state at timestep :math:`t-1` , respectively
    - :math:`\widetilde{c_t}` represents the candidate cell state
    - :math:`i_t` , :math:`f_t` and :math:`o_t` represent input gate, forget gate, output gate, respectively
    - :math:`W` represents weight (e.g., :math:`W_{ix}` is the weight of a linear transformation of input :math:`x_{t}` when calculating input gate :math:`i_t` )
    - :math:`b` represents bias (e.g., :math:`b_{i}` is the bias of input gate)
    - :math:`\sigma` represents nonlinear activation function for gate, default sigmoid
    - :math:`\odot` represents the Hadamard product of a matrix, i.e. multiplying the elements of the same position for two matrices with the same dimension to get another matrix with the same dimension

    Parameters:
        input ( :ref:`api_guide_Variable_en` ): LSTM input tensor, multi-dimensional LODTensor of shape :math:`[T, 4*hidden\_size]` . Data type is float32 or float64.
        size (int): must be 4 * hidden_size.
        h_0( :ref:`api_guide_Variable_en` , optional): The initial hidden state of the LSTM, multi-dimensional Tensor of shape :math:`[batch\_size, hidden\_size]` .
                       Data type is float32 or float64. If set to None, it will be a vector of all 0. Default: None.
        c_0( :ref:`api_guide_Variable_en` , optional): The initial hidden state of the LSTM, multi-dimensional Tensor of shape :math:`[batch\_size, hidden\_size]` .
                       Data type is float32 or float64. If set to None, it will be a vector of all 0. `h_0` and `c_0` can be None but only at the same time. Default: None.
        param_attr(ParamAttr, optional): Parameter attribute of weight. If it is None, the default weight parameter attribute is used. Please refer to ref:`api_fluid_ParamAttr' .
                              If the user needs to set this parameter, the dimension must be :math:`[hidden\_size, 4*hidden\_size]` . Default: None.

                              - Weights = :math:`\{ W_{cr},W_{ir},W_{fr},W_{or} \}` , the shape is [hidden_size, 4*hidden_size].

        bias_attr (ParamAttr, optional): The bias attribute for the learnable bias
                              weights, which contains two parts, input-hidden
                              bias weights and peephole connections weights if
                              setting `use_peepholes` to `True`.
                              Please refer to ref:`api_fluid_ParamAttr' . Default: None.

                              1. `use_peepholes = False`
                                 - Biases = {:math:`b_c, b_i, b_f, b_o`}.
                                 - The shape is [1, 4*hidden_size].
                              2. `use_peepholes = True`
                                 - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
                                                 W_{fc}, W_{oc}`}.
                                 - The shape is [1, 7*hidden_size].
558

559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576
        use_peepholes (bool, optional): Whether to use peephole connection or not. Default: True.
        is_reverse (bool, optional): Whether to calculate reverse LSTM. Default: False.
        gate_activation (str, optional): The activation for input gate, forget gate and output gate. Default: "sigmoid".
        cell_activation (str, optional): The activation for cell output. Default: "tanh".
        candidate_activation (str, optional): The activation for candidate hidden state. Default: "tanh".
        dtype (str, optional): Data type, can be "float32" or "float64". Default: "float32".
        name (str, optional): A name for this layer. Please refer to :ref:`api_guide_Name` . Default: None.

    Returns:
        tuple ( :ref:`api_guide_Variable` , :ref:`api_guide_Variable` ) :

            The hidden state and cell state of LSTM

                - hidden: LoDTensor with shape of :math:`[T, hidden\_size]` , and its lod and dtype is the same as the input.
                - cell: LoDTensor with shape of :math:`[T, hidden\_size]` , and its lod and dtype is the same as the input.

    Examples:
        .. code-block:: python
577

578 579 580 581
            import paddle.fluid as fluid
            emb_dim = 256
            vocab_size = 10000
            hidden_dim = 512
582

583 584 585 586 587 588 589 590 591 592 593
            data = fluid.data(name='x', shape=[None], dtype='int64', lod_level=1)
            emb = fluid.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True)

            forward_proj = fluid.layers.fc(input=emb, size=hidden_dim * 4,
                                           bias_attr=False)

            forward, cell = fluid.layers.dynamic_lstm(
                input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
            forward.shape  # (-1, 512)
            cell.shape  # (-1, 512)
    """
594 595 596 597 598 599 600 601 602 603
    assert (
        _non_static_mode() is not True
    ), "please use lstm instead of dynamic_lstm in dygraph mode!"
    assert (
        bias_attr is not False
    ), "bias_attr should not be False in dynamic_lstm."

    check_variable_and_dtype(
        input, 'input', ['float32', 'float64'], 'dynamic_lstm'
    )
604 605 606

    check_type(h_0, 'h_0', (Variable, type(None)), 'dynamic_lstm')
    if isinstance(h_0, Variable):
607 608 609
        check_variable_and_dtype(
            h_0, 'h_0', ['float32', 'float64'], 'dynamic_lstm'
        )
610 611 612

    check_type(c_0, 'c_0', (Variable, type(None)), 'dynamic_lstm')
    if isinstance(c_0, Variable):
613 614 615
        check_variable_and_dtype(
            c_0, 'c_0', ['float32', 'float64'], 'dynamic_lstm'
        )
616

617 618
    helper = LayerHelper('lstm', **locals())
    size = size // 4
619 620 621
    weight = helper.create_parameter(
        attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype
    )
622 623 624
    bias_size = [1, 7 * size]
    if not use_peepholes:
        bias_size[1] = 4 * size
625 626 627
    bias = helper.create_parameter(
        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True
    )
628 629 630 631 632 633 634 635

    hidden = helper.create_variable_for_type_inference(dtype)
    cell = helper.create_variable_for_type_inference(dtype)
    batch_gate = helper.create_variable_for_type_inference(dtype)
    batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
    batch_size = input.shape[0]
    if h_0:
636
        assert h_0.shape == (batch_size, size), (
637
            'The shape of h0 should be (batch_size, %d)' % size
638
        )
639 640
        inputs['H0'] = h_0
    if c_0:
641
        assert c_0.shape == (batch_size, size), (
642
            'The shape of c0 should be (batch_size, %d)' % size
643
        )
644 645
        inputs['C0'] = c_0

646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662
    helper.append_op(
        type='lstm',
        inputs=inputs,
        outputs={
            'Hidden': hidden,
            'Cell': cell,
            'BatchGate': batch_gate,
            'BatchCellPreAct': batch_cell_pre_act,
        },
        attrs={
            'use_peepholes': use_peepholes,
            'is_reverse': is_reverse,
            'gate_activation': gate_activation,
            'cell_activation': cell_activation,
            'candidate_activation': candidate_activation,
        },
    )
663 664 665
    return hidden, cell


666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684
@deprecated(
    since='2.0.0',
    update_to='paddle.nn.LSTM',
    reason="This API may occur CUDNN errors.",
)
def lstm(
    input,
    init_h,
    init_c,
    max_len,
    hidden_size,
    num_layers,
    dropout_prob=0.0,
    is_bidirec=False,
    is_test=False,
    name=None,
    default_initializer=None,
    seed=-1,
):
685
    r"""
S
swtkiwi 已提交
686

687 688 689
    **Note**:
        This OP only supports running on GPU devices.

690
    This OP implements LSTM operation - `Hochreiter, S., & Schmidhuber, J. (1997) <https://blog.xpgreat.com/file/lstm.pdf>`_ .
691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726

    The implementation of this OP does not include diagonal/peephole connections.
    Please refer to `Gers, F. A., & Schmidhuber, J. (2000) <ftp://ftp.idsia.ch/pub/juergen/TimeCount-IJCNN2000.pdf>`_ .
    If you need peephole connections, please use :ref:`api_fluid_layers_dynamic_lstm` .

    This OP computes each timestep as follows:

    .. math::
      i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_{x_i} + b_{h_i})
    .. math::
      f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_{x_f} + b_{h_f})
    .. math::
      o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_{x_o} + b_{h_o})
    .. math::
      \widetilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + b{x_c} + b_{h_c})
    .. math::
      c_t = f_t \odot c_{t-1} + i_t \odot \widetilde{c_t}
    .. math::
      h_t = o_t \odot tanh(c_t)

    The symbolic meanings in the formula are as follows:

    - :math:`x_{t}` represents the input at timestep :math:`t`
    - :math:`h_{t}` represents the hidden state at timestep :math:`t`
    - :math:`h_{t-1}, c_{t-1}` represent the hidden state and cell state at timestep :math:`t-1` , respectively
    - :math:`\widetilde{c_t}` represents the candidate cell state
    - :math:`i_t` , :math:`f_t` and :math:`o_t` represent input gate, forget gate, output gate, respectively
    - :math:`W` represents weight (e.g., :math:`W_{ix}` is the weight of a linear transformation of input :math:`x_{t}` when calculating input gate :math:`i_t` )
    - :math:`b` represents bias (e.g., :math:`b_{i}` is the bias of input gate)
    - :math:`\sigma` represents nonlinear activation function for gate, default sigmoid
    - :math:`\odot` represents the Hadamard product of a matrix, i.e. multiplying the elements of the same position for two matrices with the same dimension to get another matrix with the same dimension

    Parameters:
        input ( :ref:`api_guide_Variable_en` ): LSTM input tensor, 3-D Tensor of shape :math:`[batch\_size, seq\_len, input\_dim]` . Data type is float32 or float64
        init_h( :ref:`api_guide_Variable_en` ): The initial hidden state of the LSTM, 3-D Tensor of shape :math:`[num\_layers, batch\_size, hidden\_size]` .
                       If is_bidirec = True, shape should be :math:`[num\_layers*2, batch\_size, hidden\_size]` . Data type is float32 or float64.
G
GaoWei8 已提交
727
        max_len (int): This parameter has no effect and will be discarded.
728 729 730 731 732 733 734 735 736 737 738 739
        init_c( :ref:`api_guide_Variable_en` ): The initial cell state of the LSTM, 3-D Tensor of shape :math:`[num\_layers, batch\_size, hidden\_size]` .
                       If is_bidirec = True, shape should be :math:`[num\_layers*2, batch\_size, hidden\_size]` . Data type is float32 or float64.
        hidden_size (int): hidden size of the LSTM.
        num_layers (int): total layers number of the LSTM.
        dropout_prob(float, optional): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
                             There is NO dropout work on rnn output of the last RNN layers.
                             Default: 0.0.
        is_bidirec (bool, optional): If it is bidirectional. Default: False.
        is_test (bool, optional): If it is in test phrase. Default: False.
        name (str, optional): A name for this layer. If set None, the layer
                         will be named automatically. Default: None.
        default_initializer(Initializer, optional): Where use initializer to initialize the Weight
T
tianshuo78520a 已提交
740
                         If set None, default initializer will be used. Default: None.
741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758
        seed(int, optional): Seed for dropout in LSTM, If it's -1, dropout will use random seed. Default: 1.

    Returns:
        tuple ( :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` ) :

                        Three tensors, rnn_out, last_h, last_c:

                        - rnn_out is result of LSTM hidden, shape is :math:`[seq\_len, batch\_size, hidden\_size]` \
                          if is_bidirec set to True, shape will be :math:`[seq\_len, batch\_size, hidden\_size*2]`
                        - last_h is the hidden state of the last step of LSTM \
                          shape is :math:`[num\_layers, batch\_size, hidden\_size]` \
                          if is_bidirec set to True, shape will be :math:`[num\_layers*2, batch\_size, hidden\_size]`
                        - last_c(Tensor): the cell state of the last step of LSTM \
                          shape is :math:`[num\_layers, batch\_size, hidden\_size]` \
                          if is_bidirec set to True, shape will be :math:`[num\_layers*2, batch\_size, hidden\_size]`

    Examples:
        .. code-block:: python
759

760
            import paddle
761 762
            import paddle.fluid as fluid
            import paddle.fluid.layers as layers
763
            paddle.enable_static()
764 765 766 767 768

            emb_dim = 256
            vocab_size = 10000
            data = fluid.data(name='x', shape=[None, 100], dtype='int64')
            emb = fluid.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True)
769
            batch_size = 100
770 771 772 773
            dropout_prob = 0.2
            input_size = 100
            hidden_size = 150
            num_layers = 1
774
            max_len = 12
775 776 777 778 779 780 781 782 783 784 785
            init_h = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0 )
            init_c = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0 )
            rnn_out, last_h, last_c = layers.lstm( emb, init_h, init_c, \
                    max_len, hidden_size, num_layers, \
                    dropout_prob=dropout_prob)
            rnn_out.shape  # (-1, 100, 150)
            last_h.shape  # (1, 20, 150)
            last_c.shape  # (1, 20, 150)
    """

    helper = LayerHelper('cudnn_lstm', **locals())
X
Xing Wu 已提交
786 787 788 789 790 791
    check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'lstm')
    check_variable_and_dtype(init_h, 'init_h', ['float32', 'float64'], 'lstm')
    check_variable_and_dtype(init_c, 'init_c', ['float32', 'float64'], 'lstm')
    check_type(max_len, 'max_len', (int), 'lstm')
    check_type(hidden_size, 'hidden_size', (int), 'lstm')
    check_type(num_layers, 'num_layers', (int), 'lstm')
792 793 794 795
    dtype = input.dtype
    input_shape = list(input.shape)
    input_size = input_shape[-1]
    weight_size = 0
G
GaoWei8 已提交
796 797
    num_dirrection = 2 if is_bidirec == True else 1

798 799
    for i in range(num_layers):
        if i == 0:
G
GaoWei8 已提交
800
            input_weight_size = (input_size * hidden_size) * 4 * num_dirrection
801
        else:
G
GaoWei8 已提交
802 803
            input_weight_size = (hidden_size * hidden_size) * 4 * num_dirrection
        hidden_weight_size = (hidden_size * hidden_size) * 4 * num_dirrection
804

G
GaoWei8 已提交
805 806
        weight_size += input_weight_size + hidden_weight_size
        weight_size += hidden_size * 8 * num_dirrection
807

808 809 810 811 812 813
    weight = helper.create_parameter(
        attr=helper.param_attr,
        shape=[weight_size],
        dtype=dtype,
        default_initializer=default_initializer,
    )
814 815 816 817

    out = helper.create_variable_for_type_inference(dtype)
    last_h = helper.create_variable_for_type_inference(dtype)
    last_c = helper.create_variable_for_type_inference(dtype)
G
GaoWei8 已提交
818
    reserve = helper.create_variable_for_type_inference(
819 820
        dtype=core.VarDesc.VarType.UINT8, stop_gradient=True
    )
G
GaoWei8 已提交
821
    state_out = helper.create_variable_for_type_inference(
822 823
        dtype=core.VarDesc.VarType.UINT8, stop_gradient=True
    )
G
GaoWei8 已提交
824
    state_out.persistable = True
825

826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850
    helper.append_op(
        type='cudnn_lstm',
        inputs={
            'Input': input,
            'InitH': init_h,
            'InitC': init_c,
            'W': weight,
        },
        outputs={
            'Out': out,
            'LastH': last_h,
            'LastC': last_c,
            'Reserve': reserve,
            'StateOut': state_out,
        },
        attrs={
            'is_bidirec': is_bidirec,
            'input_size': input_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'is_test': is_test,
            'dropout_prob': dropout_prob,
            'seed': seed,
        },
    )
851 852 853
    return out, last_h, last_c


854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872
def dynamic_lstmp(
    input,
    size,
    proj_size,
    param_attr=None,
    bias_attr=None,
    use_peepholes=True,
    is_reverse=False,
    gate_activation='sigmoid',
    cell_activation='tanh',
    candidate_activation='tanh',
    proj_activation='tanh',
    dtype='float32',
    name=None,
    h_0=None,
    c_0=None,
    cell_clip=None,
    proj_clip=None,
):
873
    r"""
S
swtkiwi 已提交
874

875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993
    **Note**:
        1. In order to improve efficiency, users must first map the input of dimension [T, hidden_size] to input of [T, 4 * hidden_size], and then pass it to this OP.

    This OP implements the LSTMP (LSTM Projected) layer.
    The LSTMP layer has a separate linear mapping layer behind the LSTM layer. -- `Sak, H., Senior, A., & Beaufays, F. (2014) <https://ai.google/research/pubs/pub43905.pdf>`_ .

    Compared with the standard LSTM layer, LSTMP has an additional linear mapping layer,
    which is used to map from the original hidden state :math:`h_t` to the lower dimensional state :math:`r_t` .
    This reduces the total number of parameters and computational complexity, especially when the output unit is relatively large.

    The default implementation of the OP contains diagonal/peephole connections,
    please refer to `Gers, F. A., & Schmidhuber, J. (2000) <ftp://ftp.idsia.ch/pub/juergen/TimeCount-IJCNN2000.pdf>`_ .
    If you need to disable the peephole connections, set use_peepholes to False.

    This OP computes each timestep as follows:

    .. math::
      i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i)
    .. math::
          f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f)
    .. math::
          o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_{t-1} + b_o)
    .. math::
          \widetilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c)
    .. math::
          c_t = f_t \odot c_{t-1} + i_t \odot \widetilde{c_t}
    .. math::
          h_t = o_t \odot act_h(c_t)
    .. math::
          r_t = \overline{act_h}(W_{rh}h_t)

    The symbolic meanings in the formula are as follows:

    - :math:`x_{t}` represents the input at timestep :math:`t`
    - :math:`h_{t}` represents the hidden state at timestep :math:`t`
    - :math:`r_{t}` : represents the state of the projected output of the hidden state :math:`h_{t}`
    - :math:`h_{t-1}, c_{t-1}, r_{t-1}` represent the hidden state, cell state and projected output at timestep :math:`t-1` , respectively
    - :math:`\widetilde{c_t}` represents the candidate cell state
    - :math:`i_t` , :math:`f_t` and :math:`o_t` represent input gate, forget gate, output gate, respectively
    - :math:`W` represents weight (e.g., :math:`W_{ix}` is the weight of a linear transformation of input :math:`x_{t}` when calculating input gate :math:`i_t` )
    - :math:`b` represents bias (e.g., :math:`b_{i}` is the bias of input gate)
    - :math:`\sigma` represents nonlinear activation function for gate, default sigmoid
    - :math:`\odot` represents the Hadamard product of a matrix, i.e. multiplying the elements of the same position for two matrices with the same dimension to get another matrix with the same dimension

    Parameters:
        input( :ref:`api_guide_Variable_en` ): The input of dynamic_lstmp layer, which supports
                         variable-time length input sequence.
                         It is a multi-dimensional LODTensor of shape :math:`[T, 4*hidden\_size]` . Data type is float32 or float64.
        size(int): must be 4 * hidden_size.
        proj_size(int): The size of projection output.
        param_attr(ParamAttr, optional): Parameter attribute of weight. If it is None, the default weight parameter attribute is used. Please refer to ref:`api_fluid_ParamAttr' .
                              If the user needs to set this parameter, the dimension must be :math:`[hidden\_size, 4*hidden\_size]` . Default: None.

                              - Weights = :math:`\{ W_{cr},W_{ir},W_{fr},W_{or} \}` , the shape is [P, 4*hidden_size] , where P is the projection size.
                              - Projection weight  = :math:`\{ W_{rh} \}` , the shape is [hidden_size, P].

        bias_attr (ParamAttr, optional): The bias attribute for the learnable bias
                              weights, which contains two parts, input-hidden
                              bias weights and peephole connections weights if
                              setting `use_peepholes` to `True`.
                              Please refer to ref:`api_fluid_ParamAttr' . Default: None.

                              1. `use_peepholes = False`
                                 - Biases = {:math:`b_c, b_i, b_f, b_o`}.
                                 - The shape is [1, 4*hidden_size].
                              2. `use_peepholes = True`
                                 - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
                                                 W_{fc}, W_{oc}`}.
                                 - The shape is [1, 7*hidden_size].

        use_peepholes (bool, optional): Whether to use peephole connection or not. Default True.
        is_reverse (bool, optional): Whether to calculate reverse LSTM. Default False.
        gate_activation (str, optional): The activation for input gate, forget gate and output gate. Default "sigmoid".
        cell_activation (str, optional): The activation for cell output. Default "tanh".
        candidate_activation (str, optional): The activation for candidate hidden state. Default "tanh".
        proj_activation(str, optional): The activation for projection output. Default "tanh".
        dtype (str, optional): Data type, can be "float32" or "float64". Default "float32".
        name (str, optional): A name for this layer. Please refer to :ref:`api_guide_Name` . Default: None.
        h_0( :ref:`api_guide_Variable` , optional): The initial hidden state is an optional input, default is zero.
                       This is a tensor with shape :math:`[batch\_size, P]` , where P is the projection size. Default: None.
        c_0( :ref:`api_guide_Variable` , optional): The initial cell state is an optional input, default is zero.
                       This is a tensor with shape :math:`[batch\_size, P]` , where P is the projection size.
                       `h_0` and `c_0` can be None but only at the same time. Default: None.
        cell_clip(float, optional): If not None, the cell state is clipped
                             by this value prior to the cell output activation. Default: None.
        proj_clip(float, optional): If `num_proj > 0` and `proj_clip` is
                            provided, then the projected values are clipped elementwise to within
                            `[-proj_clip, proj_clip]`. Default: None.

    Returns:
        tuple ( :ref:`api_guide_Variable` , :ref:`api_guide_Variable` ) :

                The hidden state and cell state of LSTMP

                - hidden: LoDTensor with shape of :math:`[T, P]` , and its lod and dtype is the same as the input.
                - cell: LoDTensor with shape of :math:`[T, hidden\_size]` , and its lod and dtype is the same as the input.

    Examples:

        .. code-block:: python

            import paddle.fluid as fluid
            dict_dim, emb_dim = 128, 64
            data = fluid.data(name='sequence', shape=[None], dtype='int64', lod_level=1)
            emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
            hidden_dim, proj_dim = 512, 256
            fc_out = fluid.layers.fc(input=emb, size=hidden_dim * 4,
                                    act=None, bias_attr=None)
            proj_out, last_c = fluid.layers.dynamic_lstmp(input=fc_out,
                                                    size=hidden_dim * 4,
                                                    proj_size=proj_dim,
                                                    use_peepholes=False,
                                                    is_reverse=True,
                                                    cell_activation="tanh",
                                                    proj_activation="tanh")
            proj_out.shape  # (-1, 256)
            last_c.shape  # (-1, 512)
    """

994 995 996
    assert (
        _non_static_mode() is not True
    ), "please use lstm instead of dynamic_lstmp in dygraph mode!"
997

998 999 1000
    assert (
        bias_attr is not False
    ), "bias_attr should not be False in dynamic_lstmp."
1001

1002 1003 1004
    check_variable_and_dtype(
        input, 'input', ['float32', 'float64'], 'dynamic_lstmp'
    )
1005 1006 1007

    check_type(h_0, 'h_0', (Variable, type(None)), 'dynamic_lstmp')
    if isinstance(h_0, Variable):
1008 1009 1010
        check_variable_and_dtype(
            h_0, 'h_0', ['float32', 'float64'], 'dynamic_lstmp'
        )
1011 1012 1013

    check_type(c_0, 'c_0', (Variable, type(None)), 'dynamic_lstmp')
    if isinstance(c_0, Variable):
1014 1015 1016
        check_variable_and_dtype(
            c_0, 'c_0', ['float32', 'float64'], 'dynamic_lstmp'
        )
1017

1018 1019
    helper = LayerHelper('lstmp', **locals())
    size = size // 4
1020 1021 1022 1023 1024 1025
    weight = helper.create_parameter(
        attr=helper.param_attr, shape=[proj_size, 4 * size], dtype=dtype
    )
    proj_weight = helper.create_parameter(
        attr=helper.param_attr, shape=[size, proj_size], dtype=dtype
    )
1026 1027 1028
    bias_size = [1, 7 * size]
    if not use_peepholes:
        bias_size[1] = 4 * size
1029 1030 1031
    bias = helper.create_parameter(
        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True
    )
1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042

    projection = helper.create_variable_for_type_inference(dtype)
    cell = helper.create_variable_for_type_inference(dtype)
    ordered_proj0 = helper.create_variable_for_type_inference(dtype)
    batch_hidden = helper.create_variable_for_type_inference(dtype)
    batch_gate = helper.create_variable_for_type_inference(dtype)
    batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
    inputs = {
        'Input': input,
        'Weight': weight,
        'ProjWeight': proj_weight,
1043
        'Bias': bias,
1044 1045 1046
    }
    batch_size = input.shape[0]
    if h_0:
1047
        assert h_0.shape == (batch_size, proj_size), (
1048
            'The shape of h0 should be (batch_size, %d)' % proj_size
1049
        )
1050 1051
        inputs['H0'] = h_0
    if c_0:
1052
        assert c_0.shape == (batch_size, size), (
1053
            'The shape of c0 should be (batch_size, %d)' % size
1054
        )
1055 1056 1057
        inputs['C0'] = c_0

    if cell_clip:
T
tianshuo78520a 已提交
1058
        assert cell_clip >= 0, "cell_clip should not be negative."
1059
    if proj_clip:
T
tianshuo78520a 已提交
1060
        assert proj_clip >= 0, "proj_clip should not be negative."
1061

1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082
    helper.append_op(
        type='lstmp',
        inputs=inputs,
        outputs={
            'Projection': projection,
            'Cell': cell,
            'BatchHidden': batch_hidden,
            'BatchGate': batch_gate,
            'BatchCellPreAct': batch_cell_pre_act,
        },
        attrs={
            'use_peepholes': use_peepholes,
            'cell_clip': cell_clip,
            'proj_clip': proj_clip,
            'is_reverse': is_reverse,
            'gate_activation': gate_activation,
            'cell_activation': cell_activation,
            'candidate_activation': candidate_activation,
            'proj_activation': proj_activation,
        },
    )
1083 1084 1085
    return projection, cell


1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096
def dynamic_gru(
    input,
    size,
    param_attr=None,
    bias_attr=None,
    is_reverse=False,
    gate_activation='sigmoid',
    candidate_activation='tanh',
    h_0=None,
    origin_mode=False,
):
1097
    r"""
S
swtkiwi 已提交
1098

1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138
    **Note: The input type of this must be LoDTensor. If the input type to be
    processed is Tensor, use** :ref:`api_fluid_layers_StaticRNN` .

    This operator is used to perform the calculations for a single layer of
    Gated Recurrent Unit (GRU) on full sequences step by step. The calculations
    in one time step support these two modes:

    If ``origin_mode`` is True, then the formula used is from paper
    `Learning Phrase Representations using RNN Encoder Decoder for Statistical
    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_ .

    .. math::

        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)

        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)

        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)

        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}


    if ``origin_mode`` is False, then the formula used is from paper
    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
    Modeling  <https://arxiv.org/pdf/1412.3555.pdf>`_

    .. math::

        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)

        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)

        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)

        h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}

    :math:`x_t` is the input of current time step, but it is not from ``input`` .
    This operator does not include the calculations :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` ,
    **Note** thus a fully-connect layer whose size is 3 times of ``size`` should
    be used before this operator, and the output should be used as ``input`` here.
1139
    :math:`h_{t-1}` is the hidden state from previous time step.
1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165
    :math:`u_t` , :math:`r_t` , :math:`\\tilde{h_t}` and :math:`h_t` stand for
    update gate, reset gate, candidate hidden and hidden output separately.
    :math:`W_{uh}, b_u` , :math:`W_{rh}, b_r` and :math:`W_{ch}, b_c` stand for
    the weight matrix and bias used in update gate, reset gate, candidate hidden
    calculations. For implementation, the three weight matrix are merged into a
    tensor shaped :math:`[D, D \\times 3]` , the three bias are concatenated as
    a tensor shaped :math:`[1, D \\times 3]` , where :math:`D` stands for the
    hidden size; The data layout of weight tensor is: :math:`W_{uh}` and :math:`W_{rh}`
    are concatenated with shape :math:`[D, D  \\times 2]` lying on the first part,
    and :math:`W_{ch}` lying on the latter part with shape :math:`[D, D]` .


    Args:
        input(Variable): A LoDTensor whose lod level is 1, representing the input
            after linear projection. Its shape should be :math:`[T, D \\times 3]` ,
            where :math:`T` stands for the total sequence lengths in this mini-batch,
            :math:`D` for the hidden size. The data type should be float32 or float64.
        size(int): Indicate the hidden size.
        param_attr(ParamAttr, optional):  To specify the weight parameter property.
            Default: None, which means the default weight parameter property is used.
            See usage for details in :ref:`api_fluid_ParamAttr` .
        bias_attr (ParamAttr, optional): To specify the bias parameter property.
            Default: None, which means the default bias parameter property is used.
            See usage for details in :ref:`api_fluid_ParamAttr` .
        is_reverse(bool, optional): Whether to compute in the reversed order of
            input sequences. Default False.
T
tianshuo78520a 已提交
1166
        gate_activation(str, optional): The activation function corresponding to
1167 1168
            :math:`act_g` in the formula. "sigmoid", "tanh", "relu" and "identity"
            are supported. Default "sigmoid".
T
tianshuo78520a 已提交
1169
        candidate_activation(str, optional): The activation function corresponding to
1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200
            :math:`act_c` in the formula. "sigmoid", "tanh", "relu" and "identity"
            are supported. Default "tanh".
        h_0 (Variable, optional): A Tensor representing the initial hidden state.
            It not provided, the default initial hidden state is 0. The shape is
            :math:`[N, D]` , where :math:`N` is the number of sequences in the
            mini-batch, :math:`D` for the hidden size. The data type should be
            same as ``input`` . Default None.

    Returns:
        Variable: A LoDTensor whose lod level is 1 and shape is :math:`[T, D]` , \
            where :math:`T` stands for the total sequence lengths in this mini-batch \
            :math:`D` for the hidden size. It represents GRU transformed sequence output, \
            and has the same lod and data type with ``input`` .

    Examples:

        .. code-block:: python

            import paddle.fluid as fluid

            dict_dim, emb_dim = 128, 64
            data = fluid.data(name='sequence',
                      shape=[None],
                      dtype='int64',
                      lod_level=1)
            emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
            hidden_dim = 512
            x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
            hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim)
    """

1201 1202 1203
    assert (
        _non_static_mode() is not True
    ), "please use gru instead of dynamic_gru in dygraph mode!"
1204

1205 1206 1207
    check_variable_and_dtype(
        input, 'input', ['float32', 'float64'], 'dynamic_gru'
    )
1208 1209 1210

    check_type(h_0, 'h_0', (Variable, type(None)), 'dynamic_gru')
    if isinstance(h_0, Variable):
1211 1212 1213
        check_variable_and_dtype(
            h_0, 'h_0', ['float32', 'float64'], 'dynamic_gru'
        )
1214

1215 1216 1217
    helper = LayerHelper('gru', **locals())
    dtype = helper.input_dtype()

1218 1219 1220 1221 1222 1223
    weight = helper.create_parameter(
        attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype
    )
    bias = helper.create_parameter(
        attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True
    )
1224 1225 1226
    batch_size = input.shape[0]
    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
    if h_0:
1227 1228 1229
        assert h_0.shape == (batch_size, size), (
            'The shape of h0 should be(batch_size, %d)' % size
        )
1230 1231 1232 1233 1234 1235 1236
        inputs['H0'] = h_0

    hidden = helper.create_variable_for_type_inference(dtype)
    batch_gate = helper.create_variable_for_type_inference(dtype)
    batch_reset_hidden_prev = helper.create_variable_for_type_inference(dtype)
    batch_hidden = helper.create_variable_for_type_inference(dtype)

1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252
    helper.append_op(
        type='gru',
        inputs=inputs,
        outputs={
            'Hidden': hidden,
            'BatchGate': batch_gate,
            'BatchResetHiddenPrev': batch_reset_hidden_prev,
            'BatchHidden': batch_hidden,
        },
        attrs={
            'is_reverse': is_reverse,
            'gate_activation': gate_activation,
            'activation': candidate_activation,
            'origin_mode': origin_mode,
        },
    )
1253 1254 1255
    return hidden


1256 1257 1258 1259 1260 1261 1262 1263 1264 1265
def gru_unit(
    input,
    hidden,
    size,
    param_attr=None,
    bias_attr=None,
    activation='tanh',
    gate_activation='sigmoid',
    origin_mode=False,
):
1266
    r"""
S
swtkiwi 已提交
1267

1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303
    Gated Recurrent Unit (GRU) RNN cell. This operator performs GRU calculations for
    one time step and it supports these two modes:

    If ``origin_mode`` is True, then the formula used is from paper
    `Learning Phrase Representations using RNN Encoder Decoder for Statistical
    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_ .

    .. math::

        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)

        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)

        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)

        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}


    if ``origin_mode`` is False, then the formula used is from paper
    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
    Modeling  <https://arxiv.org/pdf/1412.3555.pdf>`_

    .. math::

        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)

        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)

        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)

        h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}

    :math:`x_t` is the input of current time step, but it is not ``input`` .
    This operator does not include the calculations :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` ,
    **Note** thus a fully-connect layer whose size is 3 times of GRU hidden size should
    be used before this operator, and the output should be used as ``input`` here.
1304
    :math:`h_{t-1}` is the hidden state from previous time step.
1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331
    :math:`u_t` , :math:`r_t` , :math:`\\tilde{h_t}` and :math:`h_t` stand for
    update gate, reset gate, candidate hidden and hidden output separately.
    :math:`W_{uh}, b_u` , :math:`W_{rh}, b_r` and :math:`W_{ch}, b_c` stand for
    the weight matrix and bias used in update gate, reset gate, candidate hidden
    calculations. For implementation, the three weight matrix are merged into a
    tensor shaped :math:`[D, D \\times 3]` , the three bias are concatenated as
    a tensor shaped :math:`[1, D \\times 3]` , where :math:`D` stands for the
    hidden size; The data layout of weight tensor is: :math:`W_{uh}` and :math:`W_{rh}`
    are concatenated with shape :math:`[D, D  \\times 2]` lying on the first part,
    and :math:`W_{ch}` lying on the latter part with shape :math:`[D, D]` .


    Args:
        input(Variable): A 2D Tensor representing the input after linear projection
            after linear projection. Its shape should be :math:`[N, D \\times 3]` ,
            where :math:`N` stands for batch size, :math:`D` for the hidden size.
            The data type should be float32 or float64.
        hidden(Variable): A 2D Tensor representing the hidden state from previous step.
            Its shape should be :math:`[N, D]` , where :math:`N` stands for batch size,
            :math:`D` for the hidden size. The data type should be same as ``input`` .
        size(int): Indicate the hidden size.
        param_attr(ParamAttr, optional):  To specify the weight parameter property.
            Default: None, which means the default weight parameter property is used.
            See usage for details in :ref:`api_fluid_ParamAttr` .
        bias_attr (ParamAttr, optional): To specify the bias parameter property.
            Default: None, which means the default bias parameter property is used.
            See usage for details in :ref:`api_fluid_ParamAttr` .
T
tianshuo78520a 已提交
1332
        activation(str, optional): The activation function corresponding to
1333 1334
            :math:`act_c` in the formula. "sigmoid", "tanh", "relu" and "identity"
            are supported. Default "tanh".
T
tianshuo78520a 已提交
1335
        gate_activation(str, optional): The activation function corresponding to
1336 1337 1338 1339 1340 1341
            :math:`act_g` in the formula. "sigmoid", "tanh", "relu" and "identity"
            are supported. Default "sigmoid".

    Returns:
        tuple: The tuple contains three Tensor variables with the same data type \
            as ``input`` . They represent the hidden state for next time step ( :math:`h_t` ), \
T
tianshuo78520a 已提交
1342
            reset previous hidden state ( :math:`r_t \odot h_{t-1}` ), and the \
1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364
            concatenation of :math:`h_t, r_t, \\tilde{h_t}` . And they have shape \
            :math:`[N, D]` , :math:`[N, D]` , :math:`[N, D \times 3]` separately. \
            Usually only the hidden state for next time step ( :math:`h_t` ) is used \
            as output and state, the other two are intermediate results of calculations.

    Examples:

        .. code-block:: python

            import paddle.fluid as fluid

            dict_dim, emb_dim = 128, 64
            data = fluid.data(name='step_data', shape=[None], dtype='int64')
            emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
            hidden_dim = 512
            x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
            pre_hidden = fluid.data(
                name='pre_hidden', shape=[None, hidden_dim], dtype='float32')
            hidden = fluid.layers.gru_unit(
                input=x, hidden=pre_hidden, size=hidden_dim * 3)

    """
X
Xing Wu 已提交
1365
    check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'gru_unit')
1366 1367 1368
    check_variable_and_dtype(
        hidden, 'hidden', ['float32', 'float64'], 'gru_unit'
    )
X
Xing Wu 已提交
1369
    check_type(size, 'size', (int), 'gru_unit')
1370 1371 1372 1373
    activation_dict = dict(
        identity=0,
        sigmoid=1,
        tanh=2,
1374 1375
        relu=3,
    )
1376 1377 1378 1379 1380 1381 1382 1383
    activation = activation_dict[activation]
    gate_activation = activation_dict[gate_activation]

    helper = LayerHelper('gru_unit', **locals())
    dtype = helper.input_dtype()
    size = size // 3

    # create weight
1384 1385 1386
    weight = helper.create_parameter(
        attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype
    )
1387 1388 1389 1390 1391 1392 1393 1394

    gate = helper.create_variable_for_type_inference(dtype)
    reset_hidden_pre = helper.create_variable_for_type_inference(dtype)
    updated_hidden = helper.create_variable_for_type_inference(dtype)
    inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight}
    # create bias
    if helper.bias_attr:
        bias_size = [1, 3 * size]
1395 1396 1397
        bias = helper.create_parameter(
            attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True
        )
1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410
        inputs['Bias'] = bias

    helper.append_op(
        type='gru_unit',
        inputs=inputs,
        outputs={
            'Gate': gate,
            'ResetHiddenPrev': reset_hidden_pre,
            'Hidden': updated_hidden,
        },
        attrs={
            'activation': 2,  # tanh
            'gate_activation': 1,  # sigmoid
1411 1412 1413
            'origin_mode': origin_mode,
        },
    )
1414 1415

    return updated_hidden, reset_hidden_pre, gate