Unverified commit f79526c2, authored by Xing Wu, committed by GitHub

Fix dygraph rnn doc (#24148)

* update dygraph.rnn import

* update dygraph.rnn import

* change unit to cell

* fix math equations

* fix math equations

* fix examples

* remove unused import

* fix examples
Parent commit: fff5cfa3
@@ -50,6 +50,9 @@ from .static_runner import StaticModelRunner
 from . import dygraph_to_static
 from .dygraph_to_static import ProgramTranslator
+from . import rnn
+from .rnn import *
 __all__ = []
 __all__ += layers.__all__
 __all__ += base.__all__
@@ -60,4 +63,5 @@ __all__ += checkpoint.__all__
 __all__ += learning_rate_scheduler.__all__
 __all__ += backward_strategy.__all__
 __all__ += jit.__all__
+__all__ += rnn.__all__
 __all__ += ['ProgramTranslator']
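With `rnn` re-exported from `paddle.fluid.dygraph` (the change above), the cells can now be imported without the `.rnn` suffix. A minimal usage sketch under that assumption — the shapes and the `fluid.dygraph.guard()` scaffolding are illustrative, not part of the commit:

```python
# Sketch of the new import path enabled by `from .rnn import *` above.
# Assumes the PaddlePaddle 1.x fluid dygraph API; shapes are arbitrary.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import LSTMCell  # previously: from paddle.fluid.dygraph.rnn import LSTMCell

with fluid.dygraph.guard():
    cell = LSTMCell(hidden_size=16, input_size=32)
    step_input = fluid.dygraph.to_variable(
        np.random.uniform(-0.1, 0.1, (4, 32)).astype('float32'))
    pre_hidden = fluid.dygraph.to_variable(np.zeros((4, 16), dtype='float32'))
    pre_cell = fluid.dygraph.to_variable(np.zeros((4, 16), dtype='float32'))
    new_hidden, new_cell = cell(step_input, pre_hidden, pre_cell)
```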
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from .layers import Layer from . import Layer
from paddle.fluid import layers from ..layers import sigmoid, tanh, concat, fill_constant, matmul, elementwise_add, elementwise_mul, split
import copy import copy
__all__ = ['LSTMCell', 'GRUCell'] __all__ = ['LSTMCell', 'GRUCell']
...@@ -24,32 +24,49 @@ class LSTMCell(Layer): ...@@ -24,32 +24,49 @@ class LSTMCell(Layer):
LSTMCell implementation using basic operators. LSTMCell implementation using basic operators.
There are two LSTMCell version, the default one is compatible with CUDNN LSTM implementation. There are two LSTMCell version, the default one is compatible with CUDNN LSTM implementation.
The algorithm can be described as the equations below. The algorithm can be described as the equations below.
.. math:: .. math::
i_t &= sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) i_t &= sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i)
f_t &= sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) f_t &= sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f)
o_t &= sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) o_t &= sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o)
\\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c)
c_t &= f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} c_t &= f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t}
h_t &= o_t \\odot tanh(c_t) h_t &= o_t \\odot tanh(c_t)
The other LSTMCell version is compatible with the BasicLSTMUnit used in static graph. The other LSTMCell version is compatible with the BasicLSTMUnit used in static graph.
The algorithm can be described as the equations below. The algorithm can be described as the equations below.
.. math::
i_t &= sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i) i_t &= sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i)
f_t &= sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias ) f_t &= sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias )
o_t &= sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o) o_t &= sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o)
\\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c) \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
c_t &= f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} c_t &= f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t}
h_t &= o_t \\odot tanh(c_t) h_t &= o_t \\odot tanh(c_t)
Args: Args:
hidden_size (integer): The hidden size used in the Cell. hidden_size (integer): The hidden size used in the Cell.
input_size (integer): The input size used in the Cell. input_size (integer): The input size used in the Cell.
param_attr(ParamAttr|None): The parameter attribute for the learnable param_attr(ParamAttr|None): The parameter attribute for the learnable
weight matrix. Note: weight matrix. Note:
If it is set to None or one attribute of ParamAttr, lstm_unit will If it is set to None or one attribute of ParamAttr, LSTMCell will
create ParamAttr as param_attr. If the Initializer of the param_attr create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None. is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|None): The parameter attribute for the bias bias_attr (ParamAttr|None): The parameter attribute for the bias
of LSTM unit. of LSTMCell.
If it is set to None or one attribute of ParamAttr, lstm_unit will If it is set to None or one attribute of ParamAttr, LSTMCell will
create ParamAttr as bias_attr. If the Initializer of the bias_attr create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized as zero. Default: None. is not set, the bias is initialized as zero. Default: None.
gate_activation (function|None): The activation function for gates (actGate). gate_activation (function|None): The activation function for gates (actGate).
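To make the equations above concrete, here is a hedged NumPy sketch of one step of the CUDNN-compatible formulation. The `[i, f, c, o]` gate order and the `transpose_y=True` weight layout are taken from the `forward` pass shown later in this diff; the function itself is an illustration, not the library's implementation:

```python
# Minimal NumPy sketch of one CUDNN-compatible LSTM step (equations above).
# Gate order [i, f, g, o] mirrors the chunked split in LSTMCell.forward below.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x, h_prev, c_prev, w_ih, w_hh, b_ih, b_hh):
    # w_ih: (4*hidden, input), w_hh: (4*hidden, hidden) -- matches transpose_y=True
    gates = x @ w_ih.T + b_ih + h_prev @ w_hh.T + b_hh
    i, f, g, o = np.split(gates, 4, axis=1)
    c = sigmoid(f) * c_prev + sigmoid(i) * np.tanh(g)   # c_t
    h = sigmoid(o) * np.tanh(c)                          # h_t
    return h, c
```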
...@@ -59,15 +76,18 @@ class LSTMCell(Layer): ...@@ -59,15 +76,18 @@ class LSTMCell(Layer):
forget_bias(float|1.0): forget bias used when computing forget gate. This forget_bias(float|1.0): forget bias used when computing forget gate. This
is not used in default LSTMCell implementation (CUDNN compatiable) is not used in default LSTMCell implementation (CUDNN compatiable)
use_cudnn_impl(bool|True): whether to use CUDNN compatible LSTMCell use_cudnn_impl(bool|True): whether to use CUDNN compatible LSTMCell
dtype(string): data type used in this unit dtype(string): data type used in this cell
Returns: Returns:
None None
Examples: Examples:
.. code-block:: python .. code-block:: python
from paddle import fluid from paddle import fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.dygraph.rnn import LSTMCell from paddle.fluid.dygraph import LSTMCell
import numpy as np import numpy as np
batch_size = 64 batch_size = 64
input_size = 128 input_size = 128
...@@ -88,6 +108,7 @@ class LSTMCell(Layer): ...@@ -88,6 +108,7 @@ class LSTMCell(Layer):
pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np) pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
pre_cell_var = fluid.dygraph.to_variable(pre_cell_np) pre_cell_var = fluid.dygraph.to_variable(pre_cell_np)
new_hidden, new_cell = cudnn_lstm(step_input_var, pre_hidden_var, pre_cell_var) new_hidden, new_cell = cudnn_lstm(step_input_var, pre_hidden_var, pre_cell_var)
""" """
     def __init__(self,
@@ -107,8 +128,8 @@ class LSTMCell(Layer):
         self._param_attr = param_attr
         self._bias_attr = bias_attr
         self._dtype = dtype
-        self._gate_activation = gate_activation or layers.sigmoid
-        self._activation = activation or layers.tanh
+        self._gate_activation = gate_activation or sigmoid
+        self._activation = activation or tanh
         self._use_cudnn_impl = use_cudnn_impl
         if self._use_cudnn_impl:
@@ -154,7 +175,7 @@ class LSTMCell(Layer):
         else:
-            self._forget_bias = layers.fill_constant(
+            self._forget_bias = fill_constant(
                 [1], dtype=dtype, value=forget_bias)
             self._forget_bias.stop_gradient = False
@@ -174,29 +195,24 @@ class LSTMCell(Layer):
     def forward(self, input, pre_hidden, pre_cell):
         if self._use_cudnn_impl:
-            igates = layers.matmul(input, y=self._weight_ih, transpose_y=True)
-            igates = layers.elementwise_add(igates, self._bias_ih)
-            hgates = layers.matmul(
-                pre_hidden, self._weight_hh, transpose_y=True)
-            hgates = layers.elementwise_add(hgates, self._bias_hh)
+            igates = matmul(input, y=self._weight_ih, transpose_y=True)
+            igates = elementwise_add(igates, self._bias_ih)
+            hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True)
+            hgates = elementwise_add(hgates, self._bias_hh)
-            chunked_igates = layers.split(igates, num_or_sections=4, dim=1)
-            chunked_hgates = layers.split(hgates, num_or_sections=4, dim=1)
+            chunked_igates = split(igates, num_or_sections=4, dim=1)
+            chunked_hgates = split(hgates, num_or_sections=4, dim=1)
-            ingate = layers.elementwise_add(chunked_igates[0],
-                                            chunked_hgates[0])
+            ingate = elementwise_add(chunked_igates[0], chunked_hgates[0])
             ingate = self._gate_activation(ingate)
-            forgetgate = layers.elementwise_add(chunked_igates[1],
-                                                chunked_hgates[1])
+            forgetgate = elementwise_add(chunked_igates[1], chunked_hgates[1])
             forgetgate = self._gate_activation(forgetgate)
-            cellgate = layers.elementwise_add(chunked_igates[2],
-                                              chunked_hgates[2])
+            cellgate = elementwise_add(chunked_igates[2], chunked_hgates[2])
             cellgate = self._activation(cellgate)
-            outgate = layers.elementwise_add(chunked_igates[3],
-                                             chunked_hgates[3])
+            outgate = elementwise_add(chunked_igates[3], chunked_hgates[3])
             outgate = self._gate_activation(outgate)
             new_cell = (forgetgate * pre_cell) + (ingate * cellgate)
@@ -204,17 +220,16 @@ class LSTMCell(Layer):
         else:
-            concat_input_hidden = layers.concat([input, pre_hidden], 1)
-            gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
-            gate_input = layers.elementwise_add(gate_input, self._bias)
-            i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
-            new_cell = layers.elementwise_add(
-                layers.elementwise_mul(
-                    pre_cell,
-                    self._gate_activation(
-                        layers.elementwise_add(f, self._forget_bias))),
-                layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j)))
+            concat_input_hidden = concat([input, pre_hidden], 1)
+            gate_input = matmul(x=concat_input_hidden, y=self._weight)
+            gate_input = elementwise_add(gate_input, self._bias)
+            i, j, f, o = split(gate_input, num_or_sections=4, dim=-1)
+            new_cell = elementwise_add(
+                elementwise_mul(pre_cell,
+                                self._gate_activation(
+                                    elementwise_add(f, self._forget_bias))),
+                elementwise_mul(sigmoid(i), tanh(j)))
             new_hidden = self._activation(new_cell) * self._gate_activation(o)
         return new_hidden, new_cell
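Note that the two branches use different gate layouts: the CUDNN-compatible branch splits its projections into `[i, f, c, o]`, while the BasicLSTMUnit-compatible branch splits one fused projection into `[i, j, f, o]`, with `j` as the candidate cell. A hedged NumPy sketch of the fused branch, assuming the default sigmoid/tanh activations:

```python
# NumPy sketch of the fused (non-CUDNN) LSTM branch above.
# weight: (input_size + hidden, 4*hidden); gate order [i, j, f, o].
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def fused_lstm_step(x, h_prev, c_prev, weight, bias, forget_bias=1.0):
    gate_input = np.concatenate([x, h_prev], axis=1) @ weight + bias
    i, j, f, o = np.split(gate_input, 4, axis=1)
    c = c_prev * sigmoid(f + forget_bias) + sigmoid(i) * np.tanh(j)
    h = np.tanh(c) * sigmoid(o)
    return h, c
```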
@@ -225,28 +240,41 @@ class GRUCell(Layer):
     GRU implementation using basic operators.
     There are two GRUCell version, the default one is compatible with CUDNN GRU implementation.
     The algorithm can be described as the equations below.
     .. math::
         u_t & = sigmoid(W_{ux} x_{t} + b_ux + W_{uh} h_{t-1} + b_uh)
         r_t & = sigmoid(W_{rx} x_{t} + b_rx + W_{rh} h_{t-1} + b_rh)
         \\tilde{h_{t}} & = tanh(W_{cx} x_{t} + b_cx + r_t \\odot (W_{ch} h_{t-1} + b_ch))
         h_t & = u_t h_{t-1} + (1-u_t) \\tilde{h_{t}}
     The other LSTMCell version is compatible with the BasicGRUUnit used in static graph.
     The algorithm can be described as the equations below.
+    .. math::
         u_t & = sigmoid(W_{ux} x_{t} + W_{uh} h_{t-1} + b_u)
         r_t & = sigmoid(W_{rx} x_{t} + W_{rh} h_{t-1} + b_r)
         \\tilde{h_{t}} & = tanh(W_{cx} x_{t} + W_{ch} \\odot(r_t, h_{t-1}) + b_m)
         h_t & = u_t h_{t-1} + (1-u_t) \\tilde{h_{t}}
     Args:
         hidden_size (integer): The hidden size used in the Cell.
         input_size (integer): The input size used in the Cell.
         param_attr(ParamAttr|None): The parameter attribute for the learnable
             weight matrix. Note:
-            If it is set to None or one attribute of ParamAttr, gru_unit will
+            If it is set to None or one attribute of ParamAttr, GRUCell will
             create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. Default: None.
         bias_attr (ParamAttr|None): The parameter attribute for the bias
-            of GRU unit.
-            If it is set to None or one attribute of ParamAttr, gru_unit will
+            of GRUCell.
+            If it is set to None or one attribute of ParamAttr, GRUCell will
             create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized zero. Default: None.
         gate_activation (function|None): The activation function for gates (actGate).
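As with the LSTM, a hedged NumPy sketch of one CUDNN-compatible GRU step may help; the `[r, u, c]` chunk order mirrors the split in `forward` later in this diff, and the function is illustrative only:

```python
# NumPy sketch of one CUDNN-compatible GRU step, per the equations above.
# Chunk order [r, u, c] mirrors the split in GRUCell.forward below.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(x, h_prev, w_ih, w_hh, b_ih, b_hh):
    # w_ih: (3*hidden, input), w_hh: (3*hidden, hidden)
    ig = x @ w_ih.T + b_ih
    hg = h_prev @ w_hh.T + b_hh
    ir, iu, ic = np.split(ig, 3, axis=1)
    hr, hu, hc = np.split(hg, 3, axis=1)
    r = sigmoid(ir + hr)          # reset gate
    u = sigmoid(iu + hu)          # update gate
    c = np.tanh(ic + r * hc)      # candidate hidden
    return u * h_prev + (1.0 - u) * c
```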
gate_activation (function|None): The activation function for gates (actGate). gate_activation (function|None): The activation function for gates (actGate).
...@@ -254,15 +282,18 @@ class GRUCell(Layer): ...@@ -254,15 +282,18 @@ class GRUCell(Layer):
activation (function|None): The activation function for cell (actNode). activation (function|None): The activation function for cell (actNode).
Default: 'fluid.layers.tanh' Default: 'fluid.layers.tanh'
use_cudnn_impl(bool|True): whether to use CUDNN compatible LSTMCell use_cudnn_impl(bool|True): whether to use CUDNN compatible LSTMCell
dtype(string): data type used in this unit dtype(string): data type used in this cell
Returns: Returns:
None None
Examples: Examples:
.. code-block:: python .. code-block:: python
from paddle import fluid from paddle import fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.dygraph.rnn import GRUCell from paddle.fluid.dygraph import GRUCell
import numpy as np import numpy as np
batch_size = 64 batch_size = 64
input_size = 128 input_size = 128
...@@ -279,6 +310,7 @@ class GRUCell(Layer): ...@@ -279,6 +310,7 @@ class GRUCell(Layer):
cudnn_gru = GRUCell(hidden_size, input_size) cudnn_gru = GRUCell(hidden_size, input_size)
step_input_var = fluid.dygraph.to_variable(step_input_np) step_input_var = fluid.dygraph.to_variable(step_input_np)
pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np) pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
""" """
     def __init__(self,
@@ -297,8 +329,8 @@ class GRUCell(Layer):
         self._param_attr = param_attr
         self._bias_attr = bias_attr
         self._dtype = dtype
-        self._gate_activation = gate_activation or layers.sigmoid
-        self._activation = activation or layers.tanh
+        self._gate_activation = gate_activation or sigmoid
+        self._activation = activation or tanh
         self._use_cudnn_impl = use_cudnn_impl
         if self._use_cudnn_impl:
@@ -391,45 +423,41 @@ class GRUCell(Layer):
         if self._use_cudnn_impl:
-            igates = layers.matmul(input, y=self._weight_ih, transpose_y=True)
-            igates = layers.elementwise_add(igates, self._bias_ih)
-            hgates = layers.matmul(
-                pre_hidden, self._weight_hh, transpose_y=True)
-            hgates = layers.elementwise_add(hgates, self._bias_hh)
+            igates = matmul(input, y=self._weight_ih, transpose_y=True)
+            igates = elementwise_add(igates, self._bias_ih)
+            hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True)
+            hgates = elementwise_add(hgates, self._bias_hh)
-            chunked_igates = layers.split(igates, num_or_sections=3, dim=1)
-            chunked_hgates = layers.split(hgates, num_or_sections=3, dim=1)
+            chunked_igates = split(igates, num_or_sections=3, dim=1)
+            chunked_hgates = split(hgates, num_or_sections=3, dim=1)
-            reset_gate = layers.elementwise_add(chunked_igates[0],
-                                                chunked_hgates[0])
+            reset_gate = elementwise_add(chunked_igates[0], chunked_hgates[0])
             reset_gate = self._gate_activation(reset_gate)
-            input_gate = layers.elementwise_add(chunked_igates[1],
-                                                chunked_hgates[1])
+            input_gate = elementwise_add(chunked_igates[1], chunked_hgates[1])
             input_gate = self._gate_activation(input_gate)
             _temp = reset_gate * chunked_hgates[2]
-            new_gate = layers.elementwise_add(chunked_igates[2], _temp)
+            new_gate = elementwise_add(chunked_igates[2], _temp)
             new_gate = self._activation(new_gate)
             new_hidden = (pre_hidden - new_gate) * input_gate + new_gate
         else:
-            concat_input_hidden = layers.concat([input, pre_hidden], 1)
-            gate_input = layers.matmul(
-                x=concat_input_hidden, y=self._gate_weight)
-            gate_input = layers.elementwise_add(gate_input, self._gate_bias)
+            concat_input_hidden = concat([input, pre_hidden], 1)
+            gate_input = matmul(x=concat_input_hidden, y=self._gate_weight)
+            gate_input = elementwise_add(gate_input, self._gate_bias)
             gate_input = self._gate_activation(gate_input)
-            r, u = layers.split(gate_input, num_or_sections=2, dim=1)
+            r, u = split(gate_input, num_or_sections=2, dim=1)
             r_hidden = r * pre_hidden
-            candidate = layers.matmul(
-                layers.concat([input, r_hidden], 1), self._candidate_weight)
-            candidate = layers.elementwise_add(candidate, self._candidate_bias)
+            candidate = matmul(
+                concat([input, r_hidden], 1), self._candidate_weight)
+            candidate = elementwise_add(candidate, self._candidate_bias)
             c = self._activation(candidate)
             new_hidden = u * pre_hidden + (1 - u) * c
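For symmetry, a hedged NumPy sketch of the BasicGRUUnit-compatible branch above, assuming the default sigmoid/tanh activations and the weight shapes implied by the concat/matmul pattern:

```python
# NumPy sketch of the BasicGRUUnit-compatible GRU branch above.
# gate_w: (input_size + hidden, 2*hidden); cand_w: (input_size + hidden, hidden).
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def basic_gru_step(x, h_prev, gate_w, gate_b, cand_w, cand_b):
    gates = sigmoid(np.concatenate([x, h_prev], axis=1) @ gate_w + gate_b)
    r, u = np.split(gates, 2, axis=1)       # reset, update
    c = np.tanh(np.concatenate([x, r * h_prev], axis=1) @ cand_w + cand_b)
    return u * h_prev + (1.0 - u) * c
```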
...
@@ -17,7 +17,7 @@ from __future__ import print_function
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-from paddle.fluid.dygraph.rnn import GRUCell
+from paddle.fluid.dygraph import GRUCell
 import numpy as np
...
@@ -17,7 +17,7 @@ from __future__ import print_function
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-from paddle.fluid.dygraph.rnn import LSTMCell
+from paddle.fluid.dygraph import LSTMCell
 import numpy as np
...