diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 0afcd21b639629eea547af95b420a69efbb250cc..d5fe1630813caf41596332079c421d129520a31e 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -36,503 +36,15 @@ from ..data_feeder import check_variable_and_dtype, check_type, check_dtype from collections.abc import Sequence __all__ = [ - 'RNNCell', - 'GRUCell', - 'LSTMCell', - 'rnn', - 'birnn', 'dynamic_decode', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru', 'gru_unit', - 'lstm_unit', 'lstm', ] -class RNNCell: - """ - :api_attr: Static Graph - - RNNCell is the base class for abstraction representing the calculations - mapping the input and state to the output and new state. It is suitable to - and mostly used in RNN. - """ - - def call(self, inputs, states, **kwargs): - r""" - Every cell must implement this method to do the calculations mapping the - inputs and states to the output and new states. - - To be more flexible, both inputs and states can be a tensor variable or - a nested structure (list|tuple|namedtuple|dict) of tensor variable, that - is, a (possibly nested structure of) tensor variable[s]. - - Parameters: - inputs: A (possibly nested structure of) tensor variable[s]. - states: A (possibly nested structure of) tensor variable[s]. - **kwargs: Additional keyword arguments, provided by the caller. - - Returns: - tuple: outputs and new_states pair. outputs and new_states both \ - can be nested structure of tensor variables. new_states must \ - have the same structure with states. - - """ - raise NotImplementedError("RNNCell must implent the call function.") - - def __call__(self, inputs, states, **kwargs): - return self.call(inputs, states, **kwargs) - - def get_initial_states( - self, - batch_ref, - shape=None, - dtype='float32', - init_value=0, - batch_dim_idx=0, - ): - r""" - Generate initialized states according to provided shape, data type and - value. - - Parameters: - batch_ref: A (possibly nested structure of) tensor variable[s]. - The first dimension of the tensor will be used as batch size to - initialize states. - shape: A (possibly nested structure of) shape[s], where a shape is - represented as a list/tuple of integer). -1(for batch size) will - beautomatically inserted if shape is not started with it. If None, - property `state_shape` will be used. The default value is None. - dtype: A (possibly nested structure of) data type[s]. The structure - must be same as that of `shape`, except when all tensors' in states - has the same data type, a single data type can be used. If - property `cell.state_shape` is not available, float32 will be used - as the data type. The default value is float32. - init_value: A float value used to initialize states. - batch_dim_idx: An integer indicating which dimension of the tensor in - inputs represents batch size. The default value is 0. - - Returns: - Variable: tensor variable[s] packed in the same structure provided \ - by shape, representing the initialized states. 
- """ - check_variable_and_dtype( - batch_ref, - 'batch_ref', - ['float32', 'float64', 'int32', 'int64'], - 'RNNCell', - ) - check_type(shape, 'shape', (list, tuple, type(None), int), 'RNNCell') - if isinstance(shape, (list, tuple)): - shapes = map_structure(lambda x: x, shape) - if isinstance(shape, list): - for i, _shape in enumerate(shapes): - check_type(_shape, 'shapes[' + str(i) + ']', int, 'RNNCell') - else: - check_type(shapes, 'shapes', int, 'RNNCell') - check_dtype(dtype, 'dtype', ['float32', 'float64'], 'RNNCell') - - # TODO: use inputs and batch_size - batch_ref = flatten(batch_ref)[0] - - def _is_shape_sequence(seq): - """For shape, list/tuple of integer is the finest-grained objection""" - if isinstance(seq, list) or isinstance(seq, tuple): - if reduce( - lambda flag, x: isinstance(x, int) and flag, seq, True - ): - return False - # TODO: Add check for the illegal - if isinstance(seq, dict): - return True - return isinstance(seq, Sequence) and not isinstance(seq, str) - - class Shape: - def __init__(self, shape): - self.shape = shape if shape[0] == -1 else ([-1] + list(shape)) - - # nested structure of shapes - states_shapes = self.state_shape if shape is None else shape - is_sequence_ori = utils.is_sequence - utils.is_sequence = _is_shape_sequence - states_shapes = map_structure(lambda shape: Shape(shape), states_shapes) - utils.is_sequence = is_sequence_ori - - # nested structure of dtypes - try: - states_dtypes = self.state_dtype if dtype is None else dtype - except NotImplementedError: # use fp32 as default - states_dtypes = "float32" - if len(flatten(states_dtypes)) == 1: - dtype = flatten(states_dtypes)[0] - states_dtypes = map_structure(lambda shape: dtype, states_shapes) - - init_states = map_structure( - lambda shape, dtype: tensor.fill_constant_batch_size_like( - input=batch_ref, - shape=shape.shape, - dtype=dtype, - value=init_value, - input_dim_idx=batch_dim_idx, - ), - states_shapes, - states_dtypes, - ) - return init_states - - @property - def state_shape(self): - """ - Abstract method (property). - Used to initialize states. - A (possibly nested structure of) shape[s], where a shape is represented - as a list/tuple of integers (-1 for batch size would be automatically - inserted into a shape if shape is not started with it). - Not necessary to be implemented if states are not initialized by - `get_initial_states` or the `shape` argument is provided when using - `get_initial_states`. - """ - raise NotImplementedError( - "Please add implementaion for `state_shape` in the used cell." - ) - - @property - def state_dtype(self): - """ - Abstract method (property). - Used to initialize states. - A (possibly nested structure of) data types[s]. The structure must be - same as that of `shape`, except when all tensors' in states has the same - data type, a single data type can be used. - Not necessary to be implemented if states are not initialized - by `get_initial_states` or the `dtype` argument is provided when using - `get_initial_states`. - """ - raise NotImplementedError( - "Please add implementaion for `state_dtype` in the used cell." - ) - - -class GRUCell(RNNCell): - r""" - :api_attr: Static Graph - - Gated Recurrent Unit cell. It is a wrapper for - `fluid.contrib.layers.rnn_impl.BasicGRUUnit` to make it adapt to RNNCell. - - The formula used is as follow: - - .. 
math:: - - u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) - - r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) - - \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) - - h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} - - For more details, please refer to `Learning Phrase Representations using - RNN Encoder Decoder for Statistical Machine Translation `_ - - Examples: - - .. code-block:: python - - import paddle.fluid.layers as layers - cell = layers.GRUCell(hidden_size=256) - """ - - def __init__( - self, - hidden_size, - param_attr=None, - bias_attr=None, - gate_activation=None, - activation=None, - dtype="float32", - name="GRUCell", - ): - """ - Constructor of GRUCell. - - Parameters: - hidden_size (int): The hidden size in the GRU cell. - param_attr(ParamAttr, optional): The parameter attribute for the learnable - weight matrix. Default: None. - bias_attr (ParamAttr, optional): The parameter attribute for the bias - of GRU. Default: None. - gate_activation (function, optional): The activation function for :math:`act_g`. - Default: `fluid.layers.sigmoid`. - activation (function, optional): The activation function for :math:`act_c`. - Default: `fluid.layers.tanh`. - dtype(string, optional): The data type used in this cell. Default float32. - name(string, optional) : The name scope used to identify parameters and biases. - """ - check_type(hidden_size, 'hidden_size', (int), 'GRUCell') - check_dtype(dtype, 'dtype', ['float32', 'float64'], 'GRUCell') - self.hidden_size = hidden_size - from .. import contrib # TODO: resolve recurrent import - - self.gru_unit = contrib.layers.rnn_impl.BasicGRUUnit( - name, - hidden_size, - param_attr, - bias_attr, - gate_activation, - activation, - dtype, - ) - - def call(self, inputs, states): - r""" - Perform calculations of GRU. - - Parameters: - inputs(Variable): A tensor with shape `[batch_size, input_size]`, - corresponding to :math:`x_t` in the formula. The data type - should be float32 or float64. - states(Variable): A tensor with shape `[batch_size, hidden_size]`. - corresponding to :math:`h_{t-1}` in the formula. The data type - should be float32 or float64. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` and \ - `new_states` is the same tensor shaped `[batch_size, hidden_size]`, \ - corresponding to :math:`h_t` in the formula. The data type of the \ - tensor is same as that of `states`. - """ - - check_variable_and_dtype( - inputs, 'inputs', ['float32', 'float64'], 'GRUCell' - ) - check_variable_and_dtype( - states, 'states', ['float32', 'float64'], 'GRUCell' - ) - new_hidden = self.gru_unit(inputs, states) - return new_hidden, new_hidden - - @property - def state_shape(self): - """ - The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch - size would be automatically inserted into shape). The shape corresponds - to :math:`h_{t-1}`. - """ - return [self.hidden_size] - - -class LSTMCell(RNNCell): - r""" - :api_attr: Static Graph - - Long-Short Term Memory cell. It is a wrapper for - `fluid.contrib.layers.rnn_impl.BasicLSTMUnit` to make it adapt to RNNCell. - - The formula used is as follow: - - .. 
math:: - - i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) - - f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) - - c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) - - o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) - - h_{t} & = o_{t} act_c (c_{t}) - - For more details, please refer to `RECURRENT NEURAL NETWORK REGULARIZATION `_ - - Examples: - - .. code-block:: python - - import paddle.fluid.layers as layers - cell = layers.LSTMCell(hidden_size=256) - """ - - def __init__( - self, - hidden_size, - param_attr=None, - bias_attr=None, - gate_activation=None, - activation=None, - forget_bias=1.0, - dtype="float32", - name="LSTMCell", - ): - """ - Constructor of LSTMCell. - - Parameters: - hidden_size (int): The hidden size in the LSTM cell. - param_attr(ParamAttr, optional): The parameter attribute for the learnable - weight matrix. Default: None. - bias_attr (ParamAttr, optional): The parameter attribute for the bias - of LSTM. Default: None. - gate_activation (function, optional): The activation function for :math:`act_g`. - Default: 'fluid.layers.sigmoid'. - activation (function, optional): The activation function for :math:`act_h`. - Default: 'fluid.layers.tanh'. - forget_bias(float, optional): forget bias used when computing forget gate. - Default 1.0 - dtype(string, optional): The data type used in this cell. Default float32. - name(string, optional) : The name scope used to identify parameters and biases. - """ - - check_type(hidden_size, 'hidden_size', (int), 'LSTMCell') - check_dtype(dtype, 'dtype', ['float32', 'float64'], 'LSTMCell') - self.hidden_size = hidden_size - from .. import contrib # TODO: resolve recurrent import - - self.lstm_unit = contrib.layers.rnn_impl.BasicLSTMUnit( - name, - hidden_size, - param_attr, - bias_attr, - gate_activation, - activation, - forget_bias, - dtype, - ) - - def call(self, inputs, states): - r""" - Perform calculations of LSTM. - - Parameters: - inputs(Variable): A tensor with shape `[batch_size, input_size]`, - corresponding to :math:`x_t` in the formula. The data type - should be float32 or float64. - states(Variable): A list of containing two tensors, each shaped - `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}` - in the formula. The data type should be float32 or float64. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula; `new_states` is a list containing \ - two tenser variables shaped `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}, c_{t}` in the formula. The data type of these \ - tensors all is same as that of `states`. - """ - - check_variable_and_dtype( - inputs, 'inputs', ['float32', 'float64'], 'LSTMCell' - ) - check_type(states, 'states', list, 'LSTMCell') - if isinstance(states, list): - for i, state in enumerate(states): - check_variable_and_dtype( - state, - 'state[' + str(i) + ']', - ['float32', 'float64'], - 'LSTMCell', - ) - - pre_hidden, pre_cell = states - new_hidden, new_cell = self.lstm_unit(inputs, pre_hidden, pre_cell) - return new_hidden, [new_hidden, new_cell] - - @property - def state_shape(self): - """ - The `state_shape` of LSTMCell is a list with two shapes: `[[hidden_size], [hidden_size]]` - (-1 for batch size would be automatically inserted into shape). These two - shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` separately. 
- """ - return [[self.hidden_size], [self.hidden_size]] - - -def rnn( - cell, - inputs, - initial_states=None, - sequence_length=None, - time_major=False, - is_reverse=False, - **kwargs -): - """ - rnn creates a recurrent neural network specified by RNNCell `cell`, - which performs :code:`cell.call()` (for dygraph mode :code:`cell.forward`) - repeatedly until reaches to the maximum length of `inputs`. - - Arguments: - cell(RNNCellBase): An instance of `RNNCellBase`. - inputs(Tensor): the input sequences. - If time_major is True, the shape is - `[time_steps, batch_size, input_size]` - else the shape is `[batch_size, time_steps, input_size]`. - initial_states(Tensor|tuple|list, optional): the initial state of the - rnn cell. Tensor or a possibly nested structure of tensors. If not - provided, `cell.get_initial_states` would be called to produce - the initial state. Defaults to None. - sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 - or int32. The valid lengths of input sequences. Defaults to None. - If `sequence_length` is not None, the inputs are treated as - padded sequences. In each input sequence, elements whose time step - index are not less than the valid length are treated as paddings. - time_major (bool): Whether the first dimension of the input means the - time steps. Defaults to False. - is_reverse (bool, optional): Indicate whether to calculate in the reverse - order of input sequences. Defaults to False. - **kwargs: Additional keyword arguments to pass to `forward` of the cell. - - Returns: - (outputs, final_states) - outputs (Tensor|list|tuple): the output sequence. Tensor or nested - structure of Tensors. - If `time_major` is True, the shape of each tensor in outpus is - `[time_steps, batch_size, hidden_size]`, else - `[batch_size, time_steps, hidden_size]`. - final_states (Tensor|list|tuple): final states. A (possibly nested structure of) - tensor[s], representing the final state for RNN. It has the same - structure of intial state. Each tensor in final states has the same - shape and dtype as the corresponding tensor in initial states. - - - Examples: - - .. 
code-block:: python - - import paddle - paddle.disable_static() - - cell = paddle.nn.SimpleRNNCell(16, 32) - - inputs = paddle.rand((4, 23, 16)) - prev_h = paddle.randn((4, 32)) - outputs, final_states = paddle.fluid.layers.rnn(cell, inputs, prev_h) - - """ - if _non_static_mode(): - return _rnn_dynamic_graph( - cell, - inputs, - initial_states, - sequence_length, - time_major, - is_reverse, - **kwargs - ) - else: - return _rnn_static_graph( - cell, - inputs, - initial_states, - sequence_length, - time_major, - is_reverse, - **kwargs - ) - - class ArrayWrapper: def __init__(self, x): self.array = [x] @@ -545,273 +57,6 @@ class ArrayWrapper: return self.array.__getitem__(item) -def _maybe_copy(state, new_state, step_mask): - """update rnn state or just pass the old state through""" - new_state = paddle.tensor.math._multiply_with_axis( - new_state, step_mask, axis=0 - ) + paddle.tensor.math._multiply_with_axis(state, (1 - step_mask), axis=0) - return new_state - - -def _transpose_batch_time(x): - perm = [1, 0] + list(range(2, len(x.shape))) - return paddle.transpose(x, perm) - - -def _rnn_dynamic_graph( - cell, - inputs, - initial_states=None, - sequence_length=None, - time_major=False, - is_reverse=False, - **kwargs -): - time_step_index = 0 if time_major else 1 - flat_inputs = flatten(inputs) - time_steps = flat_inputs[0].shape[time_step_index] - - if initial_states is None: - initial_states = cell.get_initial_states( - batch_ref=inputs, batch_dim_idx=1 if time_major else 0 - ) - - if not time_major: - inputs = map_structure(_transpose_batch_time, inputs) - - if sequence_length is not None: - mask = sequence_lod.sequence_mask( - sequence_length, maxlen=time_steps, dtype=inputs.dtype - ) - mask = paddle.transpose(mask, [1, 0]) - - if is_reverse: - inputs = map_structure(lambda x: paddle.reverse(x, axis=[0]), inputs) - mask = ( - paddle.reverse(mask, axis=[0]) - if sequence_length is not None - else None - ) - - states = initial_states - outputs = [] - for i in range(time_steps): - step_inputs = map_structure(lambda x: x[i], inputs) - step_outputs, new_states = cell(step_inputs, states, **kwargs) - if sequence_length is not None: - new_states = map_structure( - partial(_maybe_copy, step_mask=mask[i]), states, new_states - ) - states = new_states - outputs = ( - map_structure(lambda x: ArrayWrapper(x), step_outputs) - if i == 0 - else map_structure( - lambda x, x_array: x_array.append(x), step_outputs, outputs - ) - ) - - final_outputs = map_structure( - lambda x: paddle.stack(x.array, axis=time_step_index), outputs - ) - - if is_reverse: - final_outputs = map_structure( - lambda x: paddle.reverse(x, axis=time_step_index), final_outputs - ) - - final_states = new_states - return final_outputs, final_states - - -def _rnn_static_graph( - cell, - inputs, - initial_states=None, - sequence_length=None, - time_major=False, - is_reverse=False, - **kwargs -): - check_type(inputs, 'inputs', (Variable, list, tuple), 'rnn') - if isinstance(inputs, (list, tuple)): - for i, input_x in enumerate(inputs): - check_variable_and_dtype( - input_x, 'inputs[' + str(i) + ']', ['float32', 'float64'], 'rnn' - ) - check_type( - initial_states, - 'initial_states', - (Variable, list, tuple, type(None)), - 'rnn', - ) - - check_type( - sequence_length, 'sequence_length', (Variable, type(None)), 'rnn' - ) - - def _switch_grad(x, stop=False): - x.stop_gradient = stop - return x - - if initial_states is None: - initial_states = cell.get_initial_states( - batch_ref=inputs, batch_dim_idx=1 if time_major else 0 - ) - 
initial_states = map_structure(_switch_grad, initial_states) - - if not time_major: - inputs = map_structure(_transpose_batch_time, inputs) - - if sequence_length: - max_seq_len = paddle.shape(flatten(inputs)[0])[0] - mask = sequence_lod.sequence_mask( - sequence_length, - maxlen=max_seq_len, - dtype=flatten(initial_states)[0].dtype, - ) - mask = paddle.transpose(mask, [1, 0]) - if is_reverse: - inputs = map_structure(lambda x: paddle.reverse(x, axis=[0]), inputs) - mask = paddle.reverse(mask, axis=[0]) if sequence_length else None - - # StaticRNN - rnn = control_flow.StaticRNN() - with rnn.step(): - inputs = map_structure(rnn.step_input, inputs) - states = map_structure(rnn.memory, initial_states) - copy_states = map_structure(lambda x: x, states) - outputs, new_states = cell(inputs, copy_states, **kwargs) - assert_same_structure(states, new_states) - if sequence_length: - step_mask = rnn.step_input(mask) - new_states = map_structure( - partial(_maybe_copy, step_mask=step_mask), states, new_states - ) - - map_structure(rnn.update_memory, states, new_states) - flat_outputs = flatten(outputs) - map_structure(rnn.step_output, outputs) - map_structure(rnn.step_output, new_states) - - rnn_out = rnn() - final_outputs = rnn_out[: len(flat_outputs)] - final_outputs = pack_sequence_as(outputs, final_outputs) - final_states = map_structure(lambda x: x[-1], rnn_out[len(flat_outputs) :]) - final_states = pack_sequence_as(new_states, final_states) - - if is_reverse: - final_outputs = map_structure( - lambda x: paddle.reverse(x, axis=[0]), final_outputs - ) - - if not time_major: - final_outputs = map_structure(_transpose_batch_time, final_outputs) - - return (final_outputs, final_states) - - -def birnn( - cell_fw, - cell_bw, - inputs, - initial_states=None, - sequence_length=None, - time_major=False, - **kwargs -): - """ - birnn creates a bidirectional recurrent neural network specified by - RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()` - (for dygraph mode :code:`cell.forward`) repeatedly until reaches to - the maximum length of `inputs` and then concat the outputs for both RNNs - along the last axis. - - Arguments: - cell_fw(RNNCellBase): An instance of `RNNCellBase`. - cell_bw(RNNCellBase): An instance of `RNNCellBase`. - inputs(Tensor): the input sequences. - If time_major is True, the shape is - `[time_steps, batch_size, input_size]` - else the shape is `[batch_size, time_steps, input_size]`. - initial_states(tuple, optional): A tuple of initial states of - `cell_fw` and `cell_bw`. - If not provided, `cell.get_initial_states` would be called to - produce initial state for each cell. Defaults to None. - sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 - or int32. The valid lengths of input sequences. Defaults to None. - If `sequence_length` is not None, the inputs are treated as - padded sequences. In each input sequence, elements whose time step - index are not less than the valid length are treated as paddings. - time_major (bool): Whether the first dimension of the input means the - time steps. Defaults to False. - **kwargs: Additional keyword arguments to pass to `forward` of each cell. - - Returns: - (outputs, final_states) - outputs (Tensor): the outputs of the bidirectional RNN. It is the - concatenation of the outputs from the forward RNN and backward - RNN along the last axis. 
- If time major is True, the shape is `[time_steps, batch_size, size]`, - else the shape is `[batch_size, time_steps, size]`, where size is - `cell_fw.hidden_size + cell_bw.hidden_size`. - final_states (tuple): A tuple of the final states of the forward - cell and backward cell. - - Examples: - - .. code-block:: python - - import paddle - paddle.disable_static() - - cell_fw = paddle.nn.LSTMCell(16, 32) - cell_bw = paddle.nn.LSTMCell(16, 32) - - inputs = paddle.rand((4, 23, 16)) - hf, cf = paddle.rand((4, 32)), paddle.rand((4, 32)) - hb, cb = paddle.rand((4, 32)), paddle.rand((4, 32)) - initial_states = ((hf, cf), (hb, cb)) - outputs, final_states = paddle.fluid.layers.birnn( - cell_fw, cell_bw, inputs, initial_states) - - """ - if initial_states is None: - states_fw = cell_fw.get_initial_states( - batch_ref=inputs, batch_dim_idx=1 if time_major else 0 - ) - states_bw = cell_fw.get_initial_states( - batch_ref=inputs, batch_dim_idx=1 if time_major else 0 - ) - else: - states_fw, states_bw = initial_states - outputs_fw, states_fw = rnn( - cell_fw, - inputs, - states_fw, - sequence_length, - time_major=time_major, - **kwargs - ) - - outputs_bw, states_bw = rnn( - cell_bw, - inputs, - states_bw, - sequence_length, - time_major=time_major, - is_reverse=True, - **kwargs - ) - - outputs = map_structure( - lambda x, y: tensor.concat([x, y], -1), outputs_fw, outputs_bw - ) - - final_states = (states_fw, states_bw) - return outputs, final_states - - def _dynamic_decode_imperative( decoder, inits=None, @@ -2175,151 +1420,3 @@ def gru_unit( ) return updated_hidden, reset_hidden_pre, gate - - -def lstm_unit( - x_t, - hidden_t_prev, - cell_t_prev, - forget_bias=0.0, - param_attr=None, - bias_attr=None, - name=None, -): - r""" - :api_attr: Static Graph - - Long-Short Term Memory (LSTM) RNN cell. This operator performs LSTM calculations for - one time step, whose implementation is based on calculations described in `RECURRENT - NEURAL NETWORK REGULARIZATION `_ . - - We add forget_bias to the biases of the forget gate in order to - reduce the scale of forgetting. The formula is as follows: - - .. math:: - - i_{t} & = \sigma(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) - - f_{t} & = \sigma(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) - - c_{t} & = f_{t}c_{t-1} + i_{t} tanh (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) - - o_{t} & = \sigma(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) - - h_{t} & = o_{t} tanh (c_{t}) - - :math:`x_{t}` stands for ``x_t`` , corresponding to the input of current time step; - :math:`h_{t-1}` and :math:`c_{t-1}` correspond to ``hidden_t_prev`` and ``cell_t_prev`` , - representing the output of from previous time step. - :math:`i_{t}, f_{t}, c_{t}, o_{t}, h_{t}` are input gate, forget gate, cell, output gate - and hidden calculation. - - Args: - x_t(Variable): A 2D Tensor representing the input of current time step. - Its shape should be :math:`[N, M]` , where :math:`N` stands for batch - size, :math:`M` for the feature size of input. The data type should - be float32 or float64. - hidden_t_prev(Variable): A 2D Tensor representing the hidden value from - previous step. Its shape should be :math:`[N, D]` , where :math:`N` - stands for batch size, :math:`D` for the hidden size. The data type - should be same as ``x_t`` . - cell_t_prev(Variable): A 2D Tensor representing the cell value from - previous step. It has the same shape and data type with ``hidden_t_prev`` . - forget_bias (float, optional): :math:`forget\\_bias` added to the biases - of the forget gate. Default 0. 
- param_attr(ParamAttr, optional): To specify the weight parameter property. - Default: None, which means the default weight parameter property is used. - See usage for details in :ref:`api_fluid_ParamAttr` . - bias_attr (ParamAttr, optional): To specify the bias parameter property. - Default: None, which means the default bias parameter property is used. - See usage for details in :ref:`api_fluid_ParamAttr` . - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: - tuple: The tuple contains two Tensor variables with the same shape and \ - data type with ``hidden_t_prev`` , representing the hidden value and \ - cell value which correspond to :math:`h_{t}` and :math:`c_{t}` in \ - the formula. - - Raises: - ValueError: Rank of x_t must be 2. - ValueError: Rank of hidden_t_prev must be 2. - ValueError: Rank of cell_t_prev must be 2. - ValueError: The 1st dimensions of x_t, hidden_t_prev and cell_t_prev must be the same. - ValueError: The 2nd dimensions of hidden_t_prev and cell_t_prev must be the same. - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - - dict_dim, emb_dim, hidden_dim = 128, 64, 512 - data = fluid.data(name='step_data', shape=[None], dtype='int64') - x = fluid.embedding(input=data, size=[dict_dim, emb_dim]) - pre_hidden = fluid.data( - name='pre_hidden', shape=[None, hidden_dim], dtype='float32') - pre_cell = fluid.data( - name='pre_cell', shape=[None, hidden_dim], dtype='float32') - hidden = fluid.layers.lstm_unit( - x_t=x, - hidden_t_prev=pre_hidden, - cell_t_prev=pre_cell) - """ - helper = LayerHelper('lstm_unit', **locals()) - check_variable_and_dtype(x_t, 'x_t', ['float32', 'float64'], 'lstm_unit') - check_variable_and_dtype( - hidden_t_prev, 'hidden_t_prev', ['float32', 'float64'], 'lstm_unit' - ) - check_variable_and_dtype( - cell_t_prev, 'cell_t_prev', ['float32', 'float64'], 'lstm_unit' - ) - if len(x_t.shape) != 2: - raise ValueError("Rank of x_t must be 2.") - - if len(hidden_t_prev.shape) != 2: - raise ValueError("Rank of hidden_t_prev must be 2.") - - if len(cell_t_prev.shape) != 2: - raise ValueError("Rank of cell_t_prev must be 2.") - - if ( - x_t.shape[0] != hidden_t_prev.shape[0] - or x_t.shape[0] != cell_t_prev.shape[0] - ): - raise ValueError( - "The 1st dimensions of x_t, hidden_t_prev and " - "cell_t_prev must be the same." - ) - - if hidden_t_prev.shape[1] != cell_t_prev.shape[1]: - raise ValueError( - "The 2nd dimensions of hidden_t_prev and " - "cell_t_prev must be the same." 
- ) - - if bias_attr is None: - bias_attr = ParamAttr() - - size = cell_t_prev.shape[1] - concat_out = nn.concat(input=[x_t, hidden_t_prev], axis=1) - fc_out = nn.fc( - input=concat_out, - size=4 * size, - param_attr=param_attr, - bias_attr=bias_attr, - ) - dtype = x_t.dtype - c = helper.create_variable_for_type_inference(dtype) - h = helper.create_variable_for_type_inference(dtype) - - helper.append_op( - type='lstm_unit', - inputs={"X": fc_out, "C_prev": cell_t_prev}, - outputs={"C": c, "H": h}, - attrs={"forget_bias": forget_bias}, - ) - - return h, c diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 14fa6974844fa31ed225a4a9c4159c3262cfd60a..c28044345b42b24017d73d508429adf479167071 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -2179,26 +2179,6 @@ class TestBook(LayerTest): x, kernel_size=[5, 3], stride=[1, 2], padding=(2, 1) ) - def make_lstm_unit(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - x_t_data = self._get_data( - name='x_t_data', shape=[10, 10], dtype='float32' - ) - x_t = layers.fc(input=x_t_data, size=10) - prev_hidden_data = self._get_data( - name='prev_hidden_data', shape=[10, 30], dtype='float32' - ) - prev_hidden = layers.fc(input=prev_hidden_data, size=30) - prev_cell_data = self._get_data( - name='prev_cell', shape=[10, 30], dtype='float32' - ) - prev_cell = layers.fc(input=prev_cell_data, size=30) - return layers.lstm_unit( - x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell - ) - def make_softmax(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() diff --git a/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py b/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py index e389ae936b6fae8f59ced2d2b6ebe4665f028aad..8a1b2fc238b22935fda8660b05f00d17f126f6b4 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py @@ -17,10 +17,6 @@ import unittest import numpy as np from op_test import OpTest -from paddle import fluid -from paddle.fluid.framework import Program, program_guard -from paddle.fluid.layers import lstm_unit - def sigmoid_np(x): return 1.0 / (1.0 + np.exp(-x)) @@ -30,79 +26,6 @@ def tanh_np(x): return 2 * sigmoid_np(2.0 * x) - 1.0 -class LstmUnitTestError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - batch_size, dict_dim, emb_dim, hidden_dim = 32, 128, 64, 512 - data = fluid.data( - name='step_data', shape=[batch_size], dtype='int64' - ) - inputs = fluid.embedding(input=data, size=[dict_dim, emb_dim]) - pre_hidden = fluid.data( - name='pre_hidden', - shape=[batch_size, hidden_dim], - dtype='float32', - ) - pre_cell = fluid.data( - name='pre_cell', shape=[batch_size, hidden_dim], dtype='float32' - ) - - np_input = np.random.uniform( - -0.1, 0.1, (batch_size, emb_dim) - ).astype('float64') - np_pre_hidden = np.random.uniform( - -0.1, 0.1, (batch_size, hidden_dim) - ).astype('float64') - np_pre_cell = np.random.uniform( - -0.1, 0.1, (batch_size, hidden_dim) - ).astype('float64') - - def test_input_Variable(): - lstm_unit(np_input, pre_hidden, pre_cell) - - self.assertRaises(TypeError, test_input_Variable) - - def test_pre_hidden_Variable(): - lstm_unit(inputs, np_pre_hidden, pre_cell) - - self.assertRaises(TypeError, test_pre_hidden_Variable) - - def test_pre_cell_Variable(): - 
lstm_unit(inputs, pre_hidden, np_pre_cell) - - self.assertRaises(TypeError, test_pre_cell_Variable) - - def test_input_type(): - error_input = fluid.data( - name='error_input', - shape=[batch_size, emb_dim], - dtype='int32', - ) - lstm_unit(error_input, pre_hidden, pre_cell) - - self.assertRaises(TypeError, test_input_type) - - def test_pre_hidden_type(): - error_pre_hidden = fluid.data( - name='error_pre_hidden', - shape=[batch_size, hidden_dim], - dtype='int32', - ) - lstm_unit(inputs, error_pre_hidden, pre_cell) - - self.assertRaises(TypeError, test_pre_hidden_type) - - def test_pre_cell_type(): - error_pre_cell = fluid.data( - name='error_pre_cell', - shape=[batch_size, hidden_dim], - dtype='int32', - ) - lstm_unit(inputs, pre_hidden, error_pre_cell) - - self.assertRaises(TypeError, test_pre_cell_type) - - class LstmUnitTest(OpTest): def setUp(self): self.op_type = "lstm_unit" diff --git a/python/paddle/fluid/tests/unittests/test_rnn_cell_api.py b/python/paddle/fluid/tests/unittests/test_rnn_cell_api.py index 2302e6f0d241fddb3faf2d187a7122ba56a1e9f4..9c9c5520c027d8b7444e7a7343700bce23dbc1a5 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_cell_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_cell_api.py @@ -16,296 +16,20 @@ import unittest import numpy import numpy as np +from rnn.rnn_numpy import LSTMCell +from rnn.rnn_numpy import rnn as numpy_rnn import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.layers as layers import paddle.fluid.layers.utils as utils -from paddle.fluid import contrib, framework -from paddle.fluid.contrib.layers import basic_lstm +from paddle.fluid import framework from paddle.fluid.executor import Executor from paddle.fluid.framework import Program, program_guard -from paddle.fluid.layers import rnn as dynamic_rnn -from paddle.fluid.layers.rnn import GRUCell, LSTMCell, RNNCell +from paddle.nn.layer.rnn import rnn as dynamic_rnn - -class TestLSTMCellError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - batch_size, input_size, hidden_size = 4, 16, 16 - inputs = fluid.data( - name='inputs', shape=[None, input_size], dtype='float32' - ) - pre_hidden = fluid.data( - name='pre_hidden', shape=[None, hidden_size], dtype='float32' - ) - pre_cell = fluid.data( - name='pre_cell', shape=[None, hidden_size], dtype='float32' - ) - cell = LSTMCell(hidden_size) - - def test_input_Variable(): - np_input = np.random.random((batch_size, input_size)).astype( - "float32" - ) - cell(np_input, [pre_hidden, pre_cell]) - - self.assertRaises(TypeError, test_input_Variable) - - def test_pre_hidden_Variable(): - np_pre_hidden = np.random.random( - (batch_size, hidden_size) - ).astype("float32") - cell(inputs, [np_pre_hidden, pre_cell]) - - self.assertRaises(TypeError, test_pre_hidden_Variable) - - def test_pre_cell_Variable(): - np_pre_cell = np.random.random((batch_size, input_size)).astype( - "float32" - ) - cell(inputs, [pre_hidden, np_pre_cell]) - - self.assertRaises(TypeError, test_pre_cell_Variable) - - def test_input_type(): - error_inputs = fluid.data( - name='error_inputs', shape=[None, input_size], dtype='int32' - ) - cell(error_inputs, [pre_hidden, pre_cell]) - - self.assertRaises(TypeError, test_input_type) - - def test_pre_hidden_type(): - error_pre_hidden = fluid.data( - name='error_pre_hidden', - shape=[None, hidden_size], - dtype='int32', - ) - cell(inputs, [error_pre_hidden, pre_cell]) - - self.assertRaises(TypeError, test_pre_hidden_type) - - def test_pre_cell_type(): 
- error_pre_cell = fluid.data( - name='error_pre_cell', - shape=[None, hidden_size], - dtype='int32', - ) - cell(inputs, [pre_hidden, error_pre_cell]) - - self.assertRaises(TypeError, test_pre_cell_type) - - def test_dtype(): - # the input type must be Variable - LSTMCell(hidden_size, dtype="int32") - - self.assertRaises(TypeError, test_dtype) - - -class TestLSTMCell(unittest.TestCase): - def setUp(self): - self.batch_size = 4 - self.input_size = 16 - self.hidden_size = 16 - - def test_run(self): - inputs = fluid.data( - name='inputs', shape=[None, self.input_size], dtype='float32' - ) - pre_hidden = fluid.data( - name='pre_hidden', shape=[None, self.hidden_size], dtype='float32' - ) - pre_cell = fluid.data( - name='pre_cell', shape=[None, self.hidden_size], dtype='float32' - ) - - cell = LSTMCell(self.hidden_size) - lstm_hidden_new, lstm_states_new = cell(inputs, [pre_hidden, pre_cell]) - - lstm_unit = contrib.layers.rnn_impl.BasicLSTMUnit( - "basicLSTM", - self.hidden_size, - None, - None, - None, - None, - 1.0, - "float32", - ) - lstm_hidden, lstm_cell = lstm_unit(inputs, pre_hidden, pre_cell) - - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = Executor(place) - exe.run(framework.default_startup_program()) - - inputs_np = np.random.uniform( - -0.1, 0.1, (self.batch_size, self.input_size) - ).astype('float32') - pre_hidden_np = np.random.uniform( - -0.1, 0.1, (self.batch_size, self.hidden_size) - ).astype('float32') - pre_cell_np = np.random.uniform( - -0.1, 0.1, (self.batch_size, self.hidden_size) - ).astype('float32') - - param_names = [ - ["LSTMCell/BasicLSTMUnit_0.w_0", "basicLSTM/BasicLSTMUnit_0.w_0"], - ["LSTMCell/BasicLSTMUnit_0.b_0", "basicLSTM/BasicLSTMUnit_0.b_0"], - ] - - for names in param_names: - param = np.array( - fluid.global_scope().find_var(names[0]).get_tensor() - ) - param = np.random.uniform(-0.1, 0.1, size=param.shape).astype( - 'float32' - ) - fluid.global_scope().find_var(names[0]).get_tensor().set( - param, place - ) - fluid.global_scope().find_var(names[1]).get_tensor().set( - param, place - ) - - out = exe.run( - feed={ - 'inputs': inputs_np, - 'pre_hidden': pre_hidden_np, - 'pre_cell': pre_cell_np, - }, - fetch_list=[lstm_hidden_new, lstm_hidden], - ) - - np.testing.assert_allclose(out[0], out[1], rtol=0.0001, atol=0) - - -class TestGRUCellError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - batch_size, input_size, hidden_size = 4, 16, 16 - inputs = fluid.data( - name='inputs', shape=[None, input_size], dtype='float32' - ) - pre_hidden = layers.data( - name='pre_hidden', - shape=[None, hidden_size], - append_batch_size=False, - dtype='float32', - ) - cell = GRUCell(hidden_size) - - def test_input_Variable(): - np_input = np.random.random((batch_size, input_size)).astype( - "float32" - ) - cell(np_input, pre_hidden) - - self.assertRaises(TypeError, test_input_Variable) - - def test_pre_hidden_Variable(): - np_pre_hidden = np.random.random( - (batch_size, hidden_size) - ).astype("float32") - cell(inputs, np_pre_hidden) - - self.assertRaises(TypeError, test_pre_hidden_Variable) - - def test_input_type(): - error_inputs = fluid.data( - name='error_inputs', shape=[None, input_size], dtype='int32' - ) - cell(error_inputs, pre_hidden) - - self.assertRaises(TypeError, test_input_type) - - def test_pre_hidden_type(): - error_pre_hidden = fluid.data( - name='error_pre_hidden', - shape=[None, hidden_size], - dtype='int32', - ) - cell(inputs, error_pre_hidden) - - 
self.assertRaises(TypeError, test_pre_hidden_type) - - def test_dtype(): - # the input type must be Variable - GRUCell(hidden_size, dtype="int32") - - self.assertRaises(TypeError, test_dtype) - - -class TestGRUCell(unittest.TestCase): - def setUp(self): - self.batch_size = 4 - self.input_size = 16 - self.hidden_size = 16 - - def test_run(self): - inputs = fluid.data( - name='inputs', shape=[None, self.input_size], dtype='float32' - ) - pre_hidden = layers.data( - name='pre_hidden', - shape=[None, self.hidden_size], - append_batch_size=False, - dtype='float32', - ) - - cell = GRUCell(self.hidden_size) - gru_hidden_new, _ = cell(inputs, pre_hidden) - - gru_unit = contrib.layers.rnn_impl.BasicGRUUnit( - "basicGRU", self.hidden_size, None, None, None, None, "float32" - ) - gru_hidden = gru_unit(inputs, pre_hidden) - - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = Executor(place) - exe.run(framework.default_startup_program()) - - inputs_np = np.random.uniform( - -0.1, 0.1, (self.batch_size, self.input_size) - ).astype('float32') - pre_hidden_np = np.random.uniform( - -0.1, 0.1, (self.batch_size, self.hidden_size) - ).astype('float32') - - param_names = [ - ["GRUCell/BasicGRUUnit_0.w_0", "basicGRU/BasicGRUUnit_0.w_0"], - ["GRUCell/BasicGRUUnit_0.w_1", "basicGRU/BasicGRUUnit_0.w_1"], - ["GRUCell/BasicGRUUnit_0.b_0", "basicGRU/BasicGRUUnit_0.b_0"], - ["GRUCell/BasicGRUUnit_0.b_1", "basicGRU/BasicGRUUnit_0.b_1"], - ] - - for names in param_names: - param = np.array( - fluid.global_scope().find_var(names[0]).get_tensor() - ) - param = np.random.uniform(-0.1, 0.1, size=param.shape).astype( - 'float32' - ) - fluid.global_scope().find_var(names[0]).get_tensor().set( - param, place - ) - fluid.global_scope().find_var(names[1]).get_tensor().set( - param, place - ) - - out = exe.run( - feed={'inputs': inputs_np, 'pre_hidden': pre_hidden_np}, - fetch_list=[gru_hidden_new, gru_hidden], - ) - - np.testing.assert_allclose(out[0], out[1], rtol=0.0001, atol=0) +paddle.enable_static() class TestRnnError(unittest.TestCase): @@ -336,7 +60,9 @@ class TestRnnError(unittest.TestCase): inputs_dynamic_rnn = paddle.transpose( inputs_basic_lstm, perm=[1, 0, 2] ) - cell = LSTMCell(hidden_size, name="LSTMCell_for_rnn") + cell = paddle.nn.LSTMCell( + input_size, hidden_size, name="LSTMCell_for_rnn" + ) np_inputs_dynamic_rnn = np.random.random( (seq_len, batch_size, input_size) ).astype("float32") @@ -362,7 +88,9 @@ class TestRnnError(unittest.TestCase): self.assertRaises(TypeError, test_input_list) def test_initial_states_type(): - cell = GRUCell(hidden_size, name="GRUCell_for_rnn") + cell = paddle.nn.GRUCell( + input_size, hidden_size, name="GRUCell_for_rnn" + ) error_initial_states = np.random.random( (batch_size, hidden_size) ).astype("float32") @@ -417,36 +145,9 @@ class TestRnn(unittest.TestCase): self.seq_len = 4 def test_run(self): - inputs_basic_lstm = fluid.data( - name='inputs_basic_lstm', - shape=[None, None, self.input_size], - dtype='float32', - ) - sequence_length = fluid.data( - name="sequence_length", shape=[None], dtype='int64' - ) - inputs_dynamic_rnn = paddle.transpose(inputs_basic_lstm, perm=[1, 0, 2]) - cell = LSTMCell(self.hidden_size, name="LSTMCell_for_rnn") - output, final_state = dynamic_rnn( - cell=cell, - inputs=inputs_dynamic_rnn, - sequence_length=sequence_length, - is_reverse=False, - ) - output_new = paddle.transpose(output, perm=[1, 0, 2]) - - rnn_out, last_hidden, last_cell = basic_lstm( - inputs_basic_lstm, - None, - None, - 
self.hidden_size, - num_layers=1, - batch_first=False, - bidirectional=False, - sequence_length=sequence_length, - forget_bias=1.0, - ) + numpy_cell = LSTMCell(self.input_size, self.hidden_size) + dynamic_cell = paddle.nn.LSTMCell(self.input_size, self.hidden_size) if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) @@ -455,60 +156,68 @@ class TestRnn(unittest.TestCase): exe = Executor(place) exe.run(framework.default_startup_program()) - inputs_basic_lstm_np = np.random.uniform( - -0.1, 0.1, (self.seq_len, self.batch_size, self.input_size) - ).astype('float32') + state = numpy_cell.parameters + for k, v in dynamic_cell.named_parameters(): + param = np.random.uniform(-0.1, 0.1, size=state[k].shape).astype( + 'float64' + ) + setattr(numpy_cell, k, param) + fluid.global_scope().find_var(v.name).get_tensor().set(param, place) + + sequence_length = fluid.data( + name="sequence_length", shape=[None], dtype='int64' + ) + inputs_rnn = fluid.data( + name='inputs_rnn', + shape=[None, None, self.input_size], + dtype='float64', + ) + pre_hidden = fluid.data( + name='pre_hidden', shape=[None, self.hidden_size], dtype='float64' + ) + pre_cell = fluid.data( + name='pre_cell', shape=[None, self.hidden_size], dtype='float64' + ) + + dynamic_output, dynamic_final_state = dynamic_rnn( + cell=dynamic_cell, + inputs=inputs_rnn, + sequence_length=sequence_length, + initial_states=(pre_hidden, pre_cell), + is_reverse=False, + ) + + inputs_rnn_np = np.random.uniform( + -0.1, 0.1, (self.batch_size, self.seq_len, self.input_size) + ).astype('float64') sequence_length_np = ( np.ones(self.batch_size, dtype='int64') * self.seq_len ) - - inputs_np = np.random.uniform( - -0.1, 0.1, (self.batch_size, self.input_size) - ).astype('float32') pre_hidden_np = np.random.uniform( -0.1, 0.1, (self.batch_size, self.hidden_size) - ).astype('float32') + ).astype('float64') pre_cell_np = np.random.uniform( -0.1, 0.1, (self.batch_size, self.hidden_size) - ).astype('float32') - - param_names = [ - [ - "LSTMCell_for_rnn/BasicLSTMUnit_0.w_0", - "basic_lstm_layers_0/BasicLSTMUnit_0.w_0", - ], - [ - "LSTMCell_for_rnn/BasicLSTMUnit_0.b_0", - "basic_lstm_layers_0/BasicLSTMUnit_0.b_0", - ], - ] - - for names in param_names: - param = np.array( - fluid.global_scope().find_var(names[0]).get_tensor() - ) - param = np.random.uniform(-0.1, 0.1, size=param.shape).astype( - 'float32' - ) - fluid.global_scope().find_var(names[0]).get_tensor().set( - param, place - ) - fluid.global_scope().find_var(names[1]).get_tensor().set( - param, place - ) + ).astype('float64') - out = exe.run( + o1, _ = numpy_rnn( + cell=numpy_cell, + inputs=inputs_rnn_np, + initial_states=(pre_hidden_np, pre_cell_np), + sequence_length=sequence_length_np, + is_reverse=False, + ) + + o2 = exe.run( feed={ - 'inputs_basic_lstm': inputs_basic_lstm_np, + 'inputs_rnn': inputs_rnn_np, 'sequence_length': sequence_length_np, - 'inputs': inputs_np, 'pre_hidden': pre_hidden_np, 'pre_cell': pre_cell_np, }, - fetch_list=[output_new, rnn_out], - ) - - np.testing.assert_allclose(out[0], out[1], rtol=0.0001) + fetch_list=[dynamic_output], + )[0] + np.testing.assert_allclose(o1, o2, rtol=0.001) class TestRnnUtil(unittest.TestCase): @@ -528,218 +237,5 @@ class TestRnnUtil(unittest.TestCase): pass -class EncoderCell(RNNCell): - """Encoder Cell""" - - def __init__( - self, - num_layers, - hidden_size, - dropout_prob=0.0, - init_scale=0.1, - ): - self.num_layers = num_layers - self.hidden_size = hidden_size - self.dropout_prob = dropout_prob - self.lstm_cells = [] - - for i in 
range(num_layers): - self.lstm_cells.append(LSTMCell(hidden_size)) - - def call(self, step_input, states): - new_states = [] - for i in range(self.num_layers): - out, new_state = self.lstm_cells[i](step_input, states[i]) - step_input = ( - layers.dropout( - out, - self.dropout_prob, - ) - if self.dropout_prob - else out - ) - new_states.append(new_state) - return step_input, new_states - - @property - def state_shape(self): - return [cell.state_shape for cell in self.lstm_cells] - - -class DecoderCell(RNNCell): - """Decoder Cell""" - - def __init__(self, num_layers, hidden_size, dropout_prob=0.0): - self.num_layers = num_layers - self.hidden_size = hidden_size - self.dropout_prob = dropout_prob - self.lstm_cells = [] - for i in range(num_layers): - self.lstm_cells.append(LSTMCell(hidden_size)) - - def call(self, step_input, states): - new_lstm_states = [] - for i in range(self.num_layers): - out, new_lstm_state = self.lstm_cells[i](step_input, states[i]) - step_input = ( - layers.dropout( - out, - self.dropout_prob, - ) - if self.dropout_prob - else out - ) - new_lstm_states.append(new_lstm_state) - return step_input, new_lstm_states - - -def def_seq2seq_model( - num_layers, hidden_size, dropout_prob, src_vocab_size, trg_vocab_size -): - "vanilla seq2seq model" - # data - source = fluid.data(name="src", shape=[None, None], dtype="int64") - source_length = fluid.data( - name="src_sequence_length", shape=[None], dtype="int64" - ) - target = fluid.data(name="trg", shape=[None, None], dtype="int64") - target_length = fluid.data( - name="trg_sequence_length", shape=[None], dtype="int64" - ) - label = fluid.data(name="label", shape=[None, None, 1], dtype="int64") - - # embedding - src_emb = fluid.embedding(source, (src_vocab_size, hidden_size)) - tar_emb = fluid.embedding(target, (src_vocab_size, hidden_size)) - - # encoder - enc_cell = EncoderCell(num_layers, hidden_size, dropout_prob) - enc_output, enc_final_state = dynamic_rnn( - cell=enc_cell, inputs=src_emb, sequence_length=source_length - ) - - # decoder - dec_cell = DecoderCell(num_layers, hidden_size, dropout_prob) - dec_output, dec_final_state = dynamic_rnn( - cell=dec_cell, inputs=tar_emb, initial_states=enc_final_state - ) - logits = layers.fc( - dec_output, - size=trg_vocab_size, - num_flatten_dims=len(dec_output.shape) - 1, - bias_attr=False, - ) - - # loss - loss = paddle.nn.functional.softmax_with_cross_entropy( - logits=logits, label=label, soft_label=False - ) - loss = layers.unsqueeze(loss, axes=[2]) - max_tar_seq_len = paddle.shape(target)[1] - tar_mask = layers.sequence_mask( - target_length, maxlen=max_tar_seq_len, dtype="float32" - ) - loss = loss * tar_mask - loss = paddle.mean(loss, axis=[0]) - loss = paddle.sum(loss) - - # optimizer - optimizer = fluid.optimizer.Adam(0.001) - optimizer.minimize(loss) - return loss - - -class TestSeq2SeqModel(unittest.TestCase): - """ - Test cases to confirm seq2seq api training correctly. 
- """ - - def setUp(self): - np.random.seed(123) - self.model_hparams = { - "num_layers": 2, - "hidden_size": 128, - "dropout_prob": 0.1, - "src_vocab_size": 100, - "trg_vocab_size": 100, - } - - self.iter_num = iter_num = 2 - self.batch_size = batch_size = 4 - src_seq_len = 10 - trg_seq_len = 12 - self.data = { - "src": np.random.randint( - 2, - self.model_hparams["src_vocab_size"], - (iter_num * batch_size, src_seq_len), - ).astype("int64"), - "src_sequence_length": np.random.randint( - 1, src_seq_len, (iter_num * batch_size,) - ).astype("int64"), - "trg": np.random.randint( - 2, - self.model_hparams["src_vocab_size"], - (iter_num * batch_size, trg_seq_len), - ).astype("int64"), - "trg_sequence_length": np.random.randint( - 1, trg_seq_len, (iter_num * batch_size,) - ).astype("int64"), - "label": np.random.randint( - 2, - self.model_hparams["src_vocab_size"], - (iter_num * batch_size, trg_seq_len, 1), - ).astype("int64"), - } - - place = ( - core.CUDAPlace(0) - if core.is_compiled_with_cuda() - else core.CPUPlace() - ) - self.exe = Executor(place) - - def test_seq2seq_model(self): - main_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(main_program, startup_program): - cost = def_seq2seq_model(**self.model_hparams) - self.exe.run(startup_program) - for iter_idx in range(self.iter_num): - cost_val = self.exe.run( - feed={ - "src": self.data["src"][ - iter_idx - * self.batch_size : (iter_idx + 1) - * self.batch_size, - :, - ], - "src_sequence_length": self.data["src_sequence_length"][ - iter_idx - * self.batch_size : (iter_idx + 1) - * self.batch_size - ], - "trg": self.data["trg"][ - iter_idx - * self.batch_size : (iter_idx + 1) - * self.batch_size, - :, - ], - "trg_sequence_length": self.data["trg_sequence_length"][ - iter_idx - * self.batch_size : (iter_idx + 1) - * self.batch_size - ], - "label": self.data["label"][ - iter_idx - * self.batch_size : (iter_idx + 1) - * self.batch_size - ], - }, - fetch_list=[cost], - )[0] - print("iter_idx: %d, cost: %f" % (iter_idx, cost_val)) - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py index f990c2171b92e32928c83bf8f2ddb0c664956e6c..91b3adcb92d262408ef38c7d9b71cd731cfea96f 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py @@ -19,12 +19,10 @@ import numpy as np import paddle import paddle.fluid as fluid -import paddle.fluid.core as core import paddle.fluid.layers as layers import paddle.nn as nn from paddle import Model, set_device from paddle.fluid.dygraph import Layer -from paddle.fluid.executor import Executor from paddle.fluid.framework import _test_eager_guard from paddle.nn import BeamSearchDecoder, dynamic_decode from paddle.static import InputSpec as Input @@ -32,257 +30,6 @@ from paddle.static import InputSpec as Input paddle.enable_static() -class EncoderCell(layers.RNNCell): - def __init__(self, num_layers, hidden_size, dropout_prob=0.0): - self.num_layers = num_layers - self.hidden_size = hidden_size - self.dropout_prob = dropout_prob - self.lstm_cells = [ - layers.LSTMCell(hidden_size) for i in range(num_layers) - ] - - def call(self, step_input, states): - new_states = [] - for i in range(self.num_layers): - out, new_state = self.lstm_cells[i](step_input, states[i]) - step_input = ( - layers.dropout(out, self.dropout_prob) - if self.dropout_prob > 0 - else out - ) - 
new_states.append(new_state) - return step_input, new_states - - @property - def state_shape(self): - return [cell.state_shape for cell in self.lstm_cells] - - -class DecoderCell(layers.RNNCell): - def __init__(self, num_layers, hidden_size, dropout_prob=0.0): - self.num_layers = num_layers - self.hidden_size = hidden_size - self.dropout_prob = dropout_prob - self.lstm_cells = [ - layers.LSTMCell(hidden_size) for i in range(num_layers) - ] - - def attention(self, hidden, encoder_output, encoder_padding_mask): - query = layers.fc( - hidden, size=encoder_output.shape[-1], bias_attr=False - ) - attn_scores = paddle.matmul( - layers.unsqueeze(query, [1]), encoder_output, transpose_y=True - ) - if encoder_padding_mask is not None: - attn_scores = paddle.add(attn_scores, encoder_padding_mask) - attn_scores = paddle.nn.functional.softmax(attn_scores) - attn_out = paddle.squeeze( - paddle.matmul(attn_scores, encoder_output), [1] - ) - attn_out = layers.concat([attn_out, hidden], 1) - attn_out = layers.fc(attn_out, size=self.hidden_size, bias_attr=False) - return attn_out - - def call( - self, step_input, states, encoder_output, encoder_padding_mask=None - ): - lstm_states, input_feed = states - new_lstm_states = [] - step_input = layers.concat([step_input, input_feed], 1) - for i in range(self.num_layers): - out, new_lstm_state = self.lstm_cells[i](step_input, lstm_states[i]) - step_input = ( - layers.dropout(out, self.dropout_prob) - if self.dropout_prob > 0 - else out - ) - new_lstm_states.append(new_lstm_state) - out = self.attention(step_input, encoder_output, encoder_padding_mask) - return out, [new_lstm_states, out] - - -class Encoder: - def __init__(self, num_layers, hidden_size, dropout_prob=0.0): - self.encoder_cell = EncoderCell(num_layers, hidden_size, dropout_prob) - - def __call__(self, src_emb, src_sequence_length): - encoder_output, encoder_final_state = layers.rnn( - cell=self.encoder_cell, - inputs=src_emb, - sequence_length=src_sequence_length, - is_reverse=False, - ) - return encoder_output, encoder_final_state - - -class Decoder: - def __init__( - self, - num_layers, - hidden_size, - dropout_prob, - decoding_strategy="infer_sample", - max_decoding_length=20, - ): - self.decoder_cell = DecoderCell(num_layers, hidden_size, dropout_prob) - self.decoding_strategy = decoding_strategy - self.max_decoding_length = ( - None - if (self.decoding_strategy == "train_greedy") - else max_decoding_length - ) - - def __call__( - self, - decoder_initial_states, - encoder_output, - encoder_padding_mask, - **kwargs - ): - output_layer = kwargs.pop("output_layer", None) - - beam_size = kwargs.get("beam_size", 4) - encoder_output = BeamSearchDecoder.tile_beam_merge_with_batch( - encoder_output, beam_size - ) - encoder_padding_mask = BeamSearchDecoder.tile_beam_merge_with_batch( - encoder_padding_mask, beam_size - ) - decoder = BeamSearchDecoder( - cell=self.decoder_cell, output_fn=output_layer, **kwargs - ) - - ( - decoder_output, - decoder_final_state, - dec_seq_lengths, - ) = layers.dynamic_decode( - decoder, - inits=decoder_initial_states, - max_step_num=self.max_decoding_length, - encoder_output=encoder_output, - encoder_padding_mask=encoder_padding_mask, - impute_finished=False # for test coverage - if self.decoding_strategy == "beam_search" - else True, - is_test=True if self.decoding_strategy == "beam_search" else False, - return_length=True, - ) - return decoder_output, decoder_final_state, dec_seq_lengths - - -class Seq2SeqModel: - """Seq2Seq model: RNN encoder-decoder with attention""" - 
- def __init__( - self, - num_layers, - hidden_size, - dropout_prob, - src_vocab_size, - trg_vocab_size, - start_token, - end_token, - decoding_strategy="infer_sample", - max_decoding_length=20, - beam_size=4, - ): - self.start_token, self.end_token = start_token, end_token - self.max_decoding_length, self.beam_size = ( - max_decoding_length, - beam_size, - ) - self.src_embeder = paddle.nn.Embedding( - src_vocab_size, - hidden_size, - weight_attr=fluid.ParamAttr(name="source_embedding"), - ) - self.trg_embeder = paddle.nn.Embedding( - trg_vocab_size, - hidden_size, - weight_attr=fluid.ParamAttr(name="target_embedding"), - ) - self.encoder = Encoder(num_layers, hidden_size, dropout_prob) - self.decoder = Decoder( - num_layers, - hidden_size, - dropout_prob, - decoding_strategy, - max_decoding_length, - ) - self.output_layer = lambda x: layers.fc( - x, - size=trg_vocab_size, - num_flatten_dims=len(x.shape) - 1, - param_attr=fluid.ParamAttr(), - bias_attr=False, - ) - - def __call__(self, src, src_length, trg=None, trg_length=None): - # encoder - encoder_output, encoder_final_state = self.encoder( - self.src_embeder(src), src_length - ) - - decoder_initial_states = [ - encoder_final_state, - self.decoder.decoder_cell.get_initial_states( - batch_ref=encoder_output, shape=[encoder_output.shape[-1]] - ), - ] - src_mask = layers.sequence_mask( - src_length, maxlen=paddle.shape(src)[1], dtype="float32" - ) - encoder_padding_mask = (src_mask - 1.0) * 1e9 - encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1]) - - # decoder - decoder_kwargs = ( - { - "inputs": self.trg_embeder(trg), - "sequence_length": trg_length, - } - if self.decoder.decoding_strategy == "train_greedy" - else ( - { - "embedding_fn": self.trg_embeder, - "beam_size": self.beam_size, - "start_token": self.start_token, - "end_token": self.end_token, - } - if self.decoder.decoding_strategy == "beam_search" - else { - "embedding_fn": self.trg_embeder, - "start_tokens": layers.fill_constant_batch_size_like( - input=encoder_output, - shape=[-1], - dtype=src.dtype, - value=self.start_token, - ), - "end_token": self.end_token, - } - ) - ) - decoder_kwargs["output_layer"] = self.output_layer - - (decoder_output, decoder_final_state, dec_seq_lengths) = self.decoder( - decoder_initial_states, - encoder_output, - encoder_padding_mask, - **decoder_kwargs - ) - if self.decoder.decoding_strategy == "beam_search": # for inference - return decoder_output - logits, samples, sample_length = ( - decoder_output.cell_outputs, - decoder_output.sample_ids, - dec_seq_lengths, - ) - probs = paddle.nn.functional.softmax(logits) - return probs, samples, sample_length - - class PolicyGradient: """policy gradient""" @@ -477,91 +224,6 @@ class SeqPGAgent: return results -class TestDynamicDecode(unittest.TestCase): - def setUp(self): - np.random.seed(123) - self.model_hparams = { - "num_layers": 2, - "hidden_size": 32, - "dropout_prob": 0.1, - "src_vocab_size": 100, - "trg_vocab_size": 100, - "start_token": 0, - "end_token": 1, - "decoding_strategy": "infer_greedy", - "max_decoding_length": 10, - } - - self.iter_num = iter_num = 2 - self.batch_size = batch_size = 4 - src_seq_len = 10 - trg_seq_len = 12 - self.data = { - "src": np.random.randint( - 2, - self.model_hparams["src_vocab_size"], - (iter_num * batch_size, src_seq_len), - ).astype("int64"), - "src_sequence_length": np.random.randint( - 1, src_seq_len, (iter_num * batch_size,) - ).astype("int64"), - "trg": np.random.randint( - 2, - self.model_hparams["src_vocab_size"], - (iter_num * 
batch_size, trg_seq_len),
-            ).astype("int64"),
-            "trg_sequence_length": np.random.randint(
-                1, trg_seq_len, (iter_num * batch_size,)
-            ).astype("int64"),
-            "label": np.random.randint(
-                2,
-                self.model_hparams["src_vocab_size"],
-                (iter_num * batch_size, trg_seq_len, 1),
-            ).astype("int64"),
-        }
-
-        place = (
-            core.CUDAPlace(0)
-            if core.is_compiled_with_cuda()
-            else core.CPUPlace()
-        )
-        self.exe = Executor(place)
-
-    def test_beam_search_infer(self):
-        paddle.set_default_dtype("float32")
-        paddle.enable_static()
-        self.model_hparams["decoding_strategy"] = "beam_search"
-        main_program = fluid.Program()
-        startup_program = fluid.Program()
-        with fluid.program_guard(main_program, startup_program):
-            source = fluid.data(name="src", shape=[None, None], dtype="int64")
-            source_length = fluid.data(
-                name="src_sequence_length", shape=[None], dtype="int64"
-            )
-            model = Seq2SeqModel(**self.model_hparams)
-            output = model(source, source_length)
-
-        self.exe.run(startup_program)
-        for iter_idx in range(self.iter_num):
-            trans_ids = self.exe.run(
-                program=main_program,
-                feed={
-                    "src": self.data["src"][
-                        iter_idx
-                        * self.batch_size : (iter_idx + 1)
-                        * self.batch_size,
-                        :,
-                    ],
-                    "src_sequence_length": self.data["src_sequence_length"][
-                        iter_idx
-                        * self.batch_size : (iter_idx + 1)
-                        * self.batch_size
-                    ],
-                },
-                fetch_list=[output],
-            )[0]
-
-
 class ModuleApiTest(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py
index 8a3ea8a209b0ee745b30c06904ad2c044cc9a7b3..59dfa2413d5c393fd7569f5c704b4d31daadc8c8 100644
--- a/python/paddle/nn/layer/rnn.py
+++ b/python/paddle/nn/layer/rnn.py
@@ -14,26 +14,389 @@
 import math
 from collections.abc import Sequence
-from functools import reduce
+from functools import partial, reduce
 
 import numpy as np
 
 import paddle
 from paddle import _C_ops, _legacy_C_ops, framework, in_dynamic_mode
-from paddle.fluid.framework import in_dygraph_mode
-from paddle.fluid.layers import utils
+from paddle.fluid.data_feeder import check_type, check_variable_and_dtype
+from paddle.fluid.framework import _non_static_mode, in_dygraph_mode
+from paddle.fluid.layers import control_flow, sequence_lod, utils
 from paddle.fluid.layers.utils import flatten, map_structure
 from paddle.framework import core
 from paddle.nn import Layer
 from paddle.nn import functional as F
 from paddle.nn import initializer as I
-from paddle.static import default_startup_program, program_guard
+from paddle.static import Variable, default_startup_program, program_guard
 
 from .container import LayerList
 
 __all__ = []
 
 
+def rnn(
+    cell,
+    inputs,
+    initial_states=None,
+    sequence_length=None,
+    time_major=False,
+    is_reverse=False,
+    **kwargs
+):
+    r"""
+    rnn creates a recurrent neural network specified by RNNCell `cell`,
+    which performs :code:`cell.call()` (for dygraph mode :code:`cell.forward`)
+    repeatedly until it reaches the maximum length of `inputs`.
+
+    Parameters:
+        cell(RNNCellBase): An instance of `RNNCellBase`.
+        inputs(Tensor): the input sequences.
+            If time_major is True, the shape is
+            `[time_steps, batch_size, input_size]`,
+            else the shape is `[batch_size, time_steps, input_size]`.
+        initial_states(Tensor|tuple|list, optional): the initial state of the
+            rnn cell. Tensor or a possibly nested structure of tensors. If not
+            provided, `cell.get_initial_states` would be called to produce
+            the initial state. Defaults to None.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as
+            padded sequences. In each input sequence, elements whose time-step
+            index is not less than the valid length are treated as padding.
+        time_major (bool, optional): Whether the first dimension of the input
+            represents the time steps. Defaults to False.
+        is_reverse (bool, optional): Indicate whether to calculate in the reverse
+            order of input sequences. Defaults to False.
+        **kwargs: Additional keyword arguments to pass to `forward` of the cell.
+
+    Returns:
+        outputs (Tensor|list|tuple): the output sequence. Tensor or nested
+            structure of Tensors.
+            If `time_major` is True, the shape of each tensor in outputs is
+            `[time_steps, batch_size, hidden_size]`, else
+            `[batch_size, time_steps, hidden_size]`.
+        final_states (Tensor|list|tuple): final states. A (possibly nested structure of)
+            tensor[s], representing the final state for RNN. It has the same
+            structure as the initial states. Each tensor in final states has the same
+            shape and dtype as the corresponding tensor in initial states.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            cell = paddle.nn.SimpleRNNCell(16, 32)
+
+            inputs = paddle.rand((4, 23, 16))
+            prev_h = paddle.randn((4, 32))
+            outputs, final_states = paddle.nn.layer.rnn(cell, inputs, prev_h)
+
+    """
+
+    if _non_static_mode():
+        return _rnn_dynamic_graph(
+            cell,
+            inputs,
+            initial_states,
+            sequence_length,
+            time_major,
+            is_reverse,
+            **kwargs
+        )
+    else:
+        return _rnn_static_graph(
+            cell,
+            inputs,
+            initial_states,
+            sequence_length,
+            time_major,
+            is_reverse,
+            **kwargs
+        )
+
+
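+# A minimal usage sketch of `rnn` with the `sequence_length` argument
+# (hypothetical sizes, dygraph mode, called by its module-level name);
+# time steps at or beyond a sequence's valid length carry that sequence's
+# state through unchanged:
+#
+#     import paddle
+#     cell = paddle.nn.SimpleRNNCell(16, 32)
+#     inputs = paddle.rand((4, 23, 16))               # [batch, time, input]
+#     lengths = paddle.to_tensor([23, 18, 7, 1])
+#     outputs, final_h = rnn(cell, inputs, sequence_length=lengths)
+#     print(outputs.shape)                            # [4, 23, 32]
+
+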
+class ArrayWrapper:
+    def __init__(self, x):
+        self.array = [x]
+
+    def append(self, x):
+        self.array.append(x)
+        return self
+
+    def __getitem__(self, item):
+        return self.array.__getitem__(item)
+
+
+def _maybe_copy(state, new_state, step_mask):
+    """update rnn state or just pass the old state through"""
+    new_state = paddle.tensor.math._multiply_with_axis(
+        new_state, step_mask, axis=0
+    ) + paddle.tensor.math._multiply_with_axis(state, (1 - step_mask), axis=0)
+    return new_state
+
+
+def _transpose_batch_time(x):
+    perm = [1, 0] + list(range(2, len(x.shape)))
+    return paddle.transpose(x, perm)
+
+
+def _rnn_dynamic_graph(
+    cell,
+    inputs,
+    initial_states=None,
+    sequence_length=None,
+    time_major=False,
+    is_reverse=False,
+    **kwargs
+):
+    time_step_index = 0 if time_major else 1
+    flat_inputs = flatten(inputs)
+    time_steps = flat_inputs[0].shape[time_step_index]
+
+    if initial_states is None:
+        initial_states = cell.get_initial_states(
+            batch_ref=inputs, batch_dim_idx=1 if time_major else 0
+        )
+
+    if not time_major:
+        inputs = map_structure(_transpose_batch_time, inputs)
+
+    if sequence_length is not None:
+        mask = sequence_lod.sequence_mask(
+            sequence_length, maxlen=time_steps, dtype=inputs.dtype
+        )
+        mask = paddle.transpose(mask, [1, 0])
+
+    if is_reverse:
+        inputs = map_structure(lambda x: paddle.reverse(x, axis=[0]), inputs)
+        mask = (
+            paddle.reverse(mask, axis=[0])
+            if sequence_length is not None
+            else None
+        )
+
+    states = initial_states
+    outputs = []
+    for i in range(time_steps):
+        step_inputs = map_structure(lambda x: x[i], inputs)
+        step_outputs, new_states = cell(step_inputs, states, **kwargs)
+        if sequence_length is not None:
+            new_states = map_structure(
+                partial(_maybe_copy, step_mask=mask[i]), states, new_states
+            )
+        states = new_states
+        outputs = (
+            map_structure(lambda x: ArrayWrapper(x), step_outputs)
+            if i == 0
+            else map_structure(
+                lambda x, x_array: x_array.append(x), step_outputs, outputs
+            )
+        )
+
+    final_outputs = map_structure(
+        lambda x: paddle.stack(x.array, axis=time_step_index), outputs
+    )
+
+    if is_reverse:
+        final_outputs = map_structure(
+            lambda x: paddle.reverse(x, axis=time_step_index), final_outputs
+        )
+
+    final_states = new_states
+    return final_outputs, final_states
+
+
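+# The loop above freezes finished sequences through `_maybe_copy`, which
+# computes `step_mask * new_state + (1 - step_mask) * old_state`, broadcast
+# over the batch axis. A plain-numpy sketch of that masked update
+# (hypothetical sizes):
+#
+#     import numpy as np
+#     old, new = np.zeros((2, 3)), np.ones((2, 3))
+#     step_mask = np.array([1.0, 0.0])[:, None]    # sequence 1 already ended
+#     merged = new * step_mask + old * (1 - step_mask)
+#     # row 0 takes the new state, row 1 keeps the old one
+
+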
+def _rnn_static_graph(
+    cell,
+    inputs,
+    initial_states=None,
+    sequence_length=None,
+    time_major=False,
+    is_reverse=False,
+    **kwargs
+):
+    check_type(inputs, 'inputs', (Variable, list, tuple), 'rnn')
+    if isinstance(inputs, (list, tuple)):
+        for i, input_x in enumerate(inputs):
+            check_variable_and_dtype(
+                input_x, 'inputs[' + str(i) + ']', ['float32', 'float64'], 'rnn'
+            )
+    check_type(
+        initial_states,
+        'initial_states',
+        (Variable, list, tuple, type(None)),
+        'rnn',
+    )
+
+    check_type(
+        sequence_length, 'sequence_length', (Variable, type(None)), 'rnn'
+    )
+
+    def _switch_grad(x, stop=False):
+        x.stop_gradient = stop
+        return x
+
+    if initial_states is None:
+        initial_states = cell.get_initial_states(
+            batch_ref=inputs, batch_dim_idx=1 if time_major else 0
+        )
+    initial_states = map_structure(_switch_grad, initial_states)
+
+    if not time_major:
+        inputs = map_structure(_transpose_batch_time, inputs)
+
+    if sequence_length:
+        max_seq_len = paddle.shape(flatten(inputs)[0])[0]
+        mask = sequence_lod.sequence_mask(
+            sequence_length,
+            maxlen=max_seq_len,
+            dtype=flatten(initial_states)[0].dtype,
+        )
+        mask = paddle.transpose(mask, [1, 0])
+    if is_reverse:
+        inputs = map_structure(lambda x: paddle.reverse(x, axis=[0]), inputs)
+        mask = paddle.reverse(mask, axis=[0]) if sequence_length else None
+
+    # StaticRNN
+    rnn = control_flow.StaticRNN()
+    with rnn.step():
+        inputs = map_structure(rnn.step_input, inputs)
+        states = map_structure(rnn.memory, initial_states)
+        copy_states = map_structure(lambda x: x, states)
+        outputs, new_states = cell(inputs, copy_states, **kwargs)
+        utils.assert_same_structure(states, new_states)
+        if sequence_length:
+            step_mask = rnn.step_input(mask)
+            new_states = map_structure(
+                partial(_maybe_copy, step_mask=step_mask), states, new_states
+            )
+
+        map_structure(rnn.update_memory, states, new_states)
+        flat_outputs = flatten(outputs)
+        map_structure(rnn.step_output, outputs)
+        map_structure(rnn.step_output, new_states)
+
+    rnn_out = rnn()
+    final_outputs = rnn_out[: len(flat_outputs)]
+    final_outputs = utils.pack_sequence_as(outputs, final_outputs)
+    final_states = map_structure(lambda x: x[-1], rnn_out[len(flat_outputs) :])
+    final_states = utils.pack_sequence_as(new_states, final_states)
+
+    if is_reverse:
+        final_outputs = map_structure(
+            lambda x: paddle.reverse(x, axis=[0]), final_outputs
+        )
+
+    if not time_major:
+        final_outputs = map_structure(_transpose_batch_time, final_outputs)
+
+    return (final_outputs, final_states)
+
+
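+# `_rnn_static_graph` flattens structured outputs and states around StaticRNN,
+# whose step outputs are positional, then restores the nesting with
+# `pack_sequence_as`. A sketch of that round-trip (hypothetical values):
+#
+#     from paddle.fluid.layers.utils import flatten, pack_sequence_as
+#     structured = (1, (2, 3))
+#     flat = flatten(structured)                            # [1, 2, 3]
+#     pack_sequence_as(structured, [10 * x for x in flat])  # (10, (20, 30))
+
+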
+def birnn(
+    cell_fw,
+    cell_bw,
+    inputs,
+    initial_states=None,
+    sequence_length=None,
+    time_major=False,
+    **kwargs
+):
+    r"""
+    birnn creates a bidirectional recurrent neural network specified by
+    RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()`
+    (for dygraph mode :code:`cell.forward`) repeatedly until it reaches the
+    maximum length of `inputs`, and then concatenates the outputs of both
+    RNNs along the last axis.
+
+    Parameters:
+        cell_fw(RNNCellBase): An instance of `RNNCellBase`.
+        cell_bw(RNNCellBase): An instance of `RNNCellBase`.
+        inputs(Tensor): the input sequences.
+            If time_major is True, the shape is
+            `[time_steps, batch_size, input_size]`,
+            else the shape is `[batch_size, time_steps, input_size]`.
+        initial_states(tuple, optional): A tuple of initial states of
+            `cell_fw` and `cell_bw`.
+            If not provided, `cell.get_initial_states` would be called to
+            produce initial state for each cell. Defaults to None.
+        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
+            or int32. The valid lengths of input sequences. Defaults to None.
+            If `sequence_length` is not None, the inputs are treated as
+            padded sequences. In each input sequence, elements whose time-step
+            index is not less than the valid length are treated as padding.
+        time_major (bool): Whether the first dimension of the input represents
+            the time steps. Defaults to False.
+        **kwargs: Additional keyword arguments to pass to `forward` of each cell.
+
+    Returns:
+        outputs (Tensor): the outputs of the bidirectional RNN. It is the
+            concatenation of the outputs from the forward RNN and backward
+            RNN along the last axis.
+            If time_major is True, the shape is `[time_steps, batch_size, size]`,
+            else the shape is `[batch_size, time_steps, size]`, where size is
+            `cell_fw.hidden_size + cell_bw.hidden_size`.
+        final_states (tuple): A tuple of the final states of the forward
+            cell and backward cell.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.disable_static()
+
+            cell_fw = paddle.nn.LSTMCell(16, 32)
+            cell_bw = paddle.nn.LSTMCell(16, 32)
+
+            inputs = paddle.rand((4, 23, 16))
+            hf, cf = paddle.rand((4, 32)), paddle.rand((4, 32))
+            hb, cb = paddle.rand((4, 32)), paddle.rand((4, 32))
+            initial_states = ((hf, cf), (hb, cb))
+            outputs, final_states = paddle.nn.layer.birnn(
+                cell_fw, cell_bw, inputs, initial_states)
+
+    """
+
+    if initial_states is None:
+        states_fw = cell_fw.get_initial_states(
+            batch_ref=inputs, batch_dim_idx=1 if time_major else 0
+        )
+        states_bw = cell_bw.get_initial_states(
+            batch_ref=inputs, batch_dim_idx=1 if time_major else 0
+        )
+    else:
+        states_fw, states_bw = initial_states
+    outputs_fw, states_fw = rnn(
+        cell_fw,
+        inputs,
+        states_fw,
+        sequence_length,
+        time_major=time_major,
+        **kwargs
+    )
+
+    outputs_bw, states_bw = rnn(
+        cell_bw,
+        inputs,
+        states_bw,
+        sequence_length,
+        time_major=time_major,
+        is_reverse=True,
+        **kwargs
+    )
+
+    outputs = map_structure(
+        lambda x, y: paddle.concat([x, y], -1), outputs_fw, outputs_bw
+    )
+
+    final_states = (states_fw, states_bw)
+    return outputs, final_states
+
+
 def split_states(states, bidirectional=False, state_components=1):
     r"""
     Split states of RNN network into possibly nested list or tuple of
@@ -779,7 +1142,7 @@ class RNN(Layer):
     def forward(
         self, inputs, initial_states=None, sequence_length=None, **kwargs
     ):
-        final_outputs, final_states = paddle.fluid.layers.rnn(
+        final_outputs, final_states = rnn(
             self.cell,
             inputs,
             initial_states=initial_states,
@@ -866,7 +1229,7 @@ class BiRNN(Layer):
             assert (
                 len(initial_states) == 2
             ), "length of initial_states should be 2 when it is a list/tuple"
 
-        outputs, final_states = paddle.fluid.layers.birnn(
+        outputs, final_states = birnn(
             self.cell_fw,
             self.cell_bw,
             inputs,