From 840ac2b3021599f289e3940f273dca92e090666c Mon Sep 17 00:00:00 2001
From: Xing Wu
Date: Tue, 14 Apr 2020 14:16:06 +0800
Subject: [PATCH] Cudnn rnn layers api (#23390)

* add cudnn compatiable rnn cell api for dygraph
* update sample code
* update some typos
* fix specify name in param_attr problem
* add pre-commit check
* remove duplicate import, test=develop
* add unittest coverage, test=develop
* make code more tight, test=develop
* cudnn_compatibale -> use_cudnn_impl, test=develop
* change api name, test=develop
---
 python/paddle/fluid/dygraph/rnn.py | 447 ++++++++++++++++++
 .../tests/unittests/test_cudnn_grucell.py | 233 +++++++++
 .../tests/unittests/test_cudnn_lstmcell.py | 254 ++++++++++
 3 files changed, 934 insertions(+)
 create mode 100644 python/paddle/fluid/dygraph/rnn.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_cudnn_grucell.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_cudnn_lstmcell.py

diff --git a/python/paddle/fluid/dygraph/rnn.py b/python/paddle/fluid/dygraph/rnn.py
new file mode 100644
index 0000000000..42fdd82b81
--- /dev/null
+++ b/python/paddle/fluid/dygraph/rnn.py
@@ -0,0 +1,447 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .layers import Layer
+from paddle.fluid import layers
+import copy
+
+__all__ = ['LSTMCell', 'GRUCell']
+
+
+class LSTMCell(Layer):
+    """
+    LSTMCell implementation using basic operators.
+    There are two LSTMCell versions, the default one is compatible with CUDNN LSTM implementation.
+    The algorithm can be described as the equations below.
+    .. math::
+        i_t &= sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i)
+        f_t &= sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f)
+        o_t &= sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o)
+        \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c)
+        c_t &= f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t}
+        h_t &= o_t \\odot tanh(c_t)
+    The other LSTMCell version is compatible with the BasicLSTMUnit used in static graph:
+    .. math::
+        i_t &= sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i)
+        f_t &= sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias )
+        o_t &= sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o)
+        \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
+        c_t &= f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t}
+        h_t &= o_t \\odot tanh(c_t)
+
+    Args:
+        hidden_size (integer): The hidden size used in the Cell.
+        input_size (integer): The input size used in the Cell.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+            weight matrix. Note:
+            If it is set to None or one attribute of ParamAttr, lstm_unit will
+            create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|None): The parameter attribute for the bias
+            of LSTM unit.
+            If it is set to None or one attribute of ParamAttr, lstm_unit will
+            create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized as zero. Default: None.
+        gate_activation (function|None): The activation function for gates (actGate).
+            Default: 'fluid.layers.sigmoid'
+        activation (function|None): The activation function for cells (actNode).
+            Default: 'fluid.layers.tanh'
+        forget_bias(float|1.0): forget bias used when computing forget gate. This
+            is not used in default LSTMCell implementation (CUDNN compatible)
+        use_cudnn_impl(bool|True): whether to use CUDNN compatible LSTMCell
+        dtype(string): data type used in this unit
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+            from paddle import fluid
+            import paddle.fluid.core as core
+            from paddle.fluid.dygraph.rnn import LSTMCell
+            import numpy as np
+
+            batch_size = 64
+            input_size = 128
+            hidden_size = 256
+
+            step_input_np = np.random.uniform(-0.1, 0.1, (
+                batch_size, input_size)).astype('float64')
+            pre_hidden_np = np.random.uniform(-0.1, 0.1, (
+                batch_size, hidden_size)).astype('float64')
+            pre_cell_np = np.random.uniform(-0.1, 0.1, (
+                batch_size, hidden_size)).astype('float64')
+
+            if core.is_compiled_with_cuda():
+                place = core.CUDAPlace(0)
+            else:
+                place = core.CPUPlace()
+
+            with fluid.dygraph.guard(place):
+                cudnn_lstm = LSTMCell(hidden_size, input_size)
+                step_input_var = fluid.dygraph.to_variable(step_input_np)
+                pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
+                pre_cell_var = fluid.dygraph.to_variable(pre_cell_np)
+                new_hidden, new_cell = cudnn_lstm(step_input_var, pre_hidden_var, pre_cell_var)
+    """
+
+    def __init__(self,
+                 hidden_size,
+                 input_size,
+                 param_attr=None,
+                 bias_attr=None,
+                 gate_activation=None,
+                 activation=None,
+                 forget_bias=1.0,
+                 use_cudnn_impl=True,
+                 dtype='float64'):
+        super(LSTMCell, self).__init__(dtype=dtype)
+
+        self._hidden_size = hidden_size
+        self._input_size = input_size
+        self._param_attr = param_attr
+        self._bias_attr = bias_attr
+        self._dtype = dtype
+        self._gate_activation = gate_activation or layers.sigmoid
+        self._activation = activation or layers.tanh
+        self._use_cudnn_impl = use_cudnn_impl
+
+        if self._use_cudnn_impl:
+            # Copy and suffix named attrs so every parameter name is distinct.
+            if self._param_attr is not None and self._param_attr.name is not None:
+                weight_ih_param_attr = copy.deepcopy(self._param_attr)
+                weight_hh_param_attr = copy.deepcopy(self._param_attr)
+                weight_ih_param_attr.name += "_weight_ih"
+                weight_hh_param_attr.name += "_weight_hh"
+            else:
+                weight_ih_param_attr = self._param_attr
+                weight_hh_param_attr = self._param_attr
+
+            if self._bias_attr is not None and self._bias_attr.name is not None:
+                bias_ih_param_attr = copy.deepcopy(self._bias_attr)
+                bias_hh_param_attr = copy.deepcopy(self._bias_attr)
+                bias_ih_param_attr.name += "_bias_ih"
+                bias_hh_param_attr.name += "_bias_hh"
+            else:
+                bias_ih_param_attr = self._bias_attr
+                bias_hh_param_attr = self._bias_attr
+
+            self._weight_ih = self.create_parameter(
+                attr=weight_ih_param_attr,
+                shape=[self._input_size, 4 * self._hidden_size],
+                dtype=self._dtype)
+
+            self._weight_hh = self.create_parameter(
+                attr=weight_hh_param_attr,
+                shape=[self._hidden_size, 4 * self._hidden_size],
+                dtype=self._dtype)
+
+            self._bias_ih = self.create_parameter(
+                attr=bias_ih_param_attr,
+                shape=[4 * self._hidden_size],
+                dtype=self._dtype,
+                is_bias=True)
+            self._bias_hh = self.create_parameter(
+                attr=bias_hh_param_attr,
+                shape=[4 * self._hidden_size],
+                dtype=self._dtype,
+                is_bias=True)
+
+        else:
+
+            self._forget_bias = layers.fill_constant(
+                [1], dtype=dtype, value=forget_bias)
+            self._forget_bias.stop_gradient = False
+
+            self._weight = self.create_parameter(
+                attr=self._param_attr,
+                shape=[
+                    self._input_size + self._hidden_size, 4 * self._hidden_size
+                ],
+                dtype=dtype)
+
+            self._bias = self.create_parameter(
+                attr=self._bias_attr,
+                shape=[4 * self._hidden_size],
+                dtype=dtype,
+                is_bias=True)
+
+    def forward(self, input, pre_hidden, pre_cell):
+        """Run one LSTM step and return ``(new_hidden, new_cell)``."""
+        if self._use_cudnn_impl:
+            # cuDNN layout: gate chunks are ordered i, f, c (candidate), o.
+            igates = layers.matmul(input, y=self._weight_ih)
+            igates = layers.elementwise_add(igates, self._bias_ih)
+            hgates = layers.matmul(pre_hidden, self._weight_hh)
+            hgates = layers.elementwise_add(hgates, self._bias_hh)
+
+            chunked_igates = layers.split(igates, num_or_sections=4, dim=1)
+            chunked_hgates = layers.split(hgates, num_or_sections=4, dim=1)
+
+            ingate = layers.elementwise_add(chunked_igates[0],
+                                            chunked_hgates[0])
+            ingate = self._gate_activation(ingate)
+
+            forgetgate = layers.elementwise_add(chunked_igates[1],
+                                                chunked_hgates[1])
+            forgetgate = self._gate_activation(forgetgate)
+
+            cellgate = layers.elementwise_add(chunked_igates[2],
+                                              chunked_hgates[2])
+            cellgate = self._activation(cellgate)
+
+            outgate = layers.elementwise_add(chunked_igates[3],
+                                             chunked_hgates[3])
+            outgate = self._gate_activation(outgate)
+
+            new_cell = (forgetgate * pre_cell) + (ingate * cellgate)
+            new_hidden = outgate * self._activation(new_cell)
+
+        else:
+            # BasicLSTMUnit layout: fused weight, chunks ordered i, j, f, o.
+            concat_input_hidden = layers.concat([input, pre_hidden], 1)
+            gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
+            gate_input = layers.elementwise_add(gate_input, self._bias)
+            i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
+            # Apply the configured activations to every gate, i and j included.
+            forget_gate = self._gate_activation(
+                layers.elementwise_add(f, self._forget_bias))
+            new_cell = layers.elementwise_add(
+                layers.elementwise_mul(pre_cell, forget_gate),
+                layers.elementwise_mul(
+                    self._gate_activation(i), self._activation(j)))
+            new_hidden = self._activation(new_cell) * self._gate_activation(o)
+
+        return new_hidden, new_cell
+
+
+class GRUCell(Layer):
+    """
+    GRU implementation using basic operators.
+    There are two GRUCell versions, the default one is compatible with CUDNN GRU implementation.
+    The algorithm can be described as the equations below.
+    .. math::
+        u_t & = sigmoid(W_{ux} x_{t} + b_{ux} + W_{uh} h_{t-1} + b_{uh})
+        r_t & = sigmoid(W_{rx} x_{t} + b_{rx} + W_{rh} h_{t-1} + b_{rh})
+        \\tilde{h_{t}} & = tanh(W_{cx} x_{t} + b_{cx} + r_t \\odot (W_{ch} h_{t-1} + b_{ch}))
+        h_t & = u_t h_{t-1} + (1-u_t) \\tilde{h_{t}}
+    The other GRUCell version is compatible with the BasicGRUUnit used in static graph:
+    .. math::
+        u_t & = sigmoid(W_{ux} x_{t} + W_{uh} h_{t-1} + b_u)
+        r_t & = sigmoid(W_{rx} x_{t} + W_{rh} h_{t-1} + b_r)
+        \\tilde{h_{t}} & = tanh(W_{cx} x_{t} + W_{ch} (r_t \\odot h_{t-1}) + b_c)
+        h_t & = u_t h_{t-1} + (1-u_t) \\tilde{h_{t}}
+    Args:
+        hidden_size (integer): The hidden size used in the Cell.
+        input_size (integer): The input size used in the Cell.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+            weight matrix. Note:
+            If it is set to None or one attribute of ParamAttr, gru_unit will
+            create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|None): The parameter attribute for the bias
+            of GRU unit.
+            If it is set to None or one attribute of ParamAttr, gru_unit will
+            create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
+        gate_activation (function|None): The activation function for gates (actGate).
+            Default: 'fluid.layers.sigmoid'
+        activation (function|None): The activation function for cell (actNode).
+            Default: 'fluid.layers.tanh'
+        use_cudnn_impl(bool|True): whether to use CUDNN compatible GRUCell
+        dtype(string): data type used in this unit
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+            from paddle import fluid
+            import paddle.fluid.core as core
+            from paddle.fluid.dygraph.rnn import GRUCell
+            import numpy as np
+
+            batch_size = 64
+            input_size = 128
+            hidden_size = 256
+            step_input_np = np.random.uniform(-0.1, 0.1, (
+                batch_size, input_size)).astype('float64')
+            pre_hidden_np = np.random.uniform(-0.1, 0.1, (
+                batch_size, hidden_size)).astype('float64')
+
+            if core.is_compiled_with_cuda():
+                place = core.CUDAPlace(0)
+            else:
+                place = core.CPUPlace()
+
+            with fluid.dygraph.guard(place):
+                cudnn_gru = GRUCell(hidden_size, input_size)
+                step_input_var = fluid.dygraph.to_variable(step_input_np)
+                pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
+                new_hidden = cudnn_gru(step_input_var, pre_hidden_var)
+    """
+
+    def __init__(self,
+                 hidden_size,
+                 input_size,
+                 param_attr=None,
+                 bias_attr=None,
+                 gate_activation=None,
+                 activation=None,
+                 use_cudnn_impl=True,
+                 dtype='float64'):
+        super(GRUCell, self).__init__(dtype=dtype)
+
+        self._hidden_size = hidden_size
+        self._input_size = input_size
+        self._param_attr = param_attr
+        self._bias_attr = bias_attr
+        self._dtype = dtype
+        self._gate_activation = gate_activation or layers.sigmoid
+        self._activation = activation or layers.tanh
+        self._use_cudnn_impl = use_cudnn_impl
+
+        if self._use_cudnn_impl:
+            # Copy and suffix named attrs so every parameter name is distinct.
+            if self._param_attr is not None and self._param_attr.name is not None:
+                weight_ih_param_attr = copy.deepcopy(self._param_attr)
+                weight_hh_param_attr = copy.deepcopy(self._param_attr)
+                weight_ih_param_attr.name += "_weight_ih"
+                weight_hh_param_attr.name += "_weight_hh"
+            else:
+                weight_ih_param_attr = self._param_attr
+                weight_hh_param_attr = self._param_attr
+
+            if self._bias_attr is not None and self._bias_attr.name is not None:
+                bias_ih_param_attr = copy.deepcopy(self._bias_attr)
+                bias_hh_param_attr = copy.deepcopy(self._bias_attr)
+                bias_ih_param_attr.name += "_bias_ih"
+                bias_hh_param_attr.name += "_bias_hh"
+            else:
+                bias_ih_param_attr = self._bias_attr
+                bias_hh_param_attr = self._bias_attr
+
+            self._weight_ih = self.create_parameter(
+                attr=weight_ih_param_attr,
+                shape=[self._input_size, 3 * self._hidden_size],
+                dtype=self._dtype)
+
+            self._weight_hh = self.create_parameter(
+                attr=weight_hh_param_attr,
+                shape=[self._hidden_size, 3 * self._hidden_size],
+                dtype=self._dtype)
+
+            self._bias_ih = self.create_parameter(
+                attr=bias_ih_param_attr,
+                shape=[3 * self._hidden_size],
+                dtype=self._dtype,
+                is_bias=True)
+            self._bias_hh = self.create_parameter(
+                attr=bias_hh_param_attr,
+                shape=[3 * self._hidden_size],
+                dtype=self._dtype,
+                is_bias=True)
+
+        else:
+            # Separate named attrs for the gate and the candidate parameters.
+            if self._param_attr is not None and self._param_attr.name is not None:
+                gate_weight_param_attr = copy.deepcopy(self._param_attr)
+                candidate_weight_param_attr = copy.deepcopy(self._param_attr)
+                gate_weight_param_attr.name += "_gate_weight"
+                candidate_weight_param_attr.name += "_candidate_weight"
+            else:
+                gate_weight_param_attr = self._param_attr
+                candidate_weight_param_attr = self._param_attr
+
+            if self._bias_attr is not None and self._bias_attr.name is not None:
+                gate_bias_param_attr = copy.deepcopy(self._bias_attr)
+                candidate_bias_param_attr = copy.deepcopy(self._bias_attr)
+                gate_bias_param_attr.name += "_gate_bias"
+                candidate_bias_param_attr.name += "_candidate_bias"
+            else:
+                gate_bias_param_attr = self._bias_attr
+                candidate_bias_param_attr = self._bias_attr
+
+            self._gate_weight = self.create_parameter(
+                attr=gate_weight_param_attr,
+                shape=[
+                    self._input_size + self._hidden_size, 2 * self._hidden_size
+                ],
+                dtype=dtype)
+
+            self._candidate_weight = self.create_parameter(
+                attr=candidate_weight_param_attr,
+                shape=[
+                    self._input_size + self._hidden_size, self._hidden_size
+                ],
+                dtype=dtype)
+
+            self._gate_bias = self.create_parameter(
+                attr=gate_bias_param_attr,
+                shape=[2 * self._hidden_size],
+                dtype=dtype,
+                is_bias=True)
+            self._candidate_bias = self.create_parameter(
+                attr=candidate_bias_param_attr,
+                shape=[self._hidden_size],
+                dtype=dtype,
+                is_bias=True)
+
+    def forward(self, input, pre_hidden):
+        """Run one GRU step and return the new hidden state."""
+        if self._use_cudnn_impl:
+            # cuDNN layout: chunks are reset, update (input_gate), candidate.
+            igates = layers.matmul(input, y=self._weight_ih)
+            igates = layers.elementwise_add(igates, self._bias_ih)
+            hgates = layers.matmul(pre_hidden, self._weight_hh)
+            hgates = layers.elementwise_add(hgates, self._bias_hh)
+
+            chunked_igates = layers.split(igates, num_or_sections=3, dim=1)
+            chunked_hgates = layers.split(hgates, num_or_sections=3, dim=1)
+
+            reset_gate = layers.elementwise_add(chunked_igates[0],
+                                                chunked_hgates[0])
+            reset_gate = self._gate_activation(reset_gate)
+
+            input_gate = layers.elementwise_add(chunked_igates[1],
+                                                chunked_hgates[1])
+            input_gate = self._gate_activation(input_gate)
+
+            _temp = reset_gate * chunked_hgates[2]
+            new_gate = layers.elementwise_add(chunked_igates[2], _temp)
+            new_gate = self._activation(new_gate)
+
+            new_hidden = (pre_hidden - new_gate) * input_gate + new_gate
+
+        else:
+            # BasicGRUUnit layout: fused [r, u] gates, then the candidate.
+            concat_input_hidden = layers.concat([input, pre_hidden], 1)
+
+            gate_input = layers.matmul(
+                x=concat_input_hidden, y=self._gate_weight)
+
+            gate_input = layers.elementwise_add(gate_input, self._gate_bias)
+            gate_input = self._gate_activation(gate_input)
+            r, u = layers.split(gate_input, num_or_sections=2, dim=1)
+
+            r_hidden = r * pre_hidden
+
+            candidate = layers.matmul(
+                layers.concat([input, r_hidden], 1), self._candidate_weight)
+            candidate = layers.elementwise_add(candidate, self._candidate_bias)
+
+            c = self._activation(candidate)
+            new_hidden = u * pre_hidden + (1 - u) * c
+
+        return new_hidden
diff --git a/python/paddle/fluid/tests/unittests/test_cudnn_grucell.py b/python/paddle/fluid/tests/unittests/test_cudnn_grucell.py
new file mode 100644
index 0000000000..b3ad8b037d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_cudnn_grucell.py
@@ -0,0 +1,233 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.dygraph.rnn import GRUCell
+
+import numpy as np
+
+np.random.seed(123)
+
+
+def sigmoid(x):
+    return 1. / (1. + np.exp(-x))
+
+
+def tanh(x):
+    return 2. * sigmoid(2. * x) - 1.
+
+
+def cudnn_step(step_input_np, pre_hidden_np, weight_ih, bias_ih, weight_hh,
+               bias_hh):
+    """NumPy reference for the default (cuDNN-compatible) GRUCell."""
+    igates = np.matmul(step_input_np, weight_ih)
+    igates += bias_ih
+    hgates = np.matmul(pre_hidden_np, weight_hh)
+    hgates += bias_hh
+    chunked_igates = np.split(igates, indices_or_sections=3, axis=1)
+    chunked_hgates = np.split(hgates, indices_or_sections=3, axis=1)
+
+    reset_gate = chunked_igates[0] + chunked_hgates[0]
+    reset_gate = sigmoid(reset_gate)
+
+    input_gate = chunked_igates[1] + chunked_hgates[1]
+    input_gate = sigmoid(input_gate)
+
+    _temp = reset_gate * chunked_hgates[2]
+    new_gate = chunked_igates[2] + _temp
+    new_gate = tanh(new_gate)
+
+    new_hidden = (pre_hidden_np - new_gate) * input_gate + new_gate
+
+    return new_hidden
+
+
+def non_cudnn_step(step_in, pre_hidden, gate_w, gate_b, candidate_w,
+                   candidate_b):
+    """NumPy reference for GRUCell(use_cudnn_impl=False)."""
+    concat_1 = np.concatenate([step_in, pre_hidden], 1)
+    gate_input = np.matmul(concat_1, gate_w)
+    gate_input += gate_b
+    gate_input = sigmoid(gate_input)
+    r, u = np.split(gate_input, indices_or_sections=2, axis=1)
+
+    r_hidden = r * pre_hidden
+
+    candidate = np.matmul(np.concatenate([step_in, r_hidden], 1), candidate_w)
+
+    candidate += candidate_b
+    c = tanh(candidate)
+
+    new_hidden = u * pre_hidden + (1 - u) * c
+
+    return new_hidden
+
+
+class TestCudnnGRU(unittest.TestCase):
+    def setUp(self):
+        self.input_size = 100
+        self.hidden_size = 200
+        self.batch_size = 64
+
+    def test_run(self):
+        """Compare unnamed and named default GRUCells with the NumPy step."""
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+
+        with fluid.dygraph.guard(place):
+            param_attr = fluid.ParamAttr(name="param_attr")
+            bias_attr = fluid.ParamAttr(name="bias_attr")
+            named_cudnn_gru = GRUCell(self.hidden_size, self.input_size,
+                                      param_attr, bias_attr)
+            cudnn_gru = GRUCell(self.hidden_size, self.input_size)
+
+            param_list = cudnn_gru.state_dict()
+            named_param_list = named_cudnn_gru.state_dict()
+
+            # process weight and bias
+
+            weight_ih_name = "_weight_ih"
+            bias_ih_name = "_bias_ih"
+            weight_hh_name = "_weight_hh"
+            bias_hh_name = "_bias_hh"
+
+            weight_ih = param_list[weight_ih_name].numpy()
+            weight_ih = np.random.uniform(
+                -0.1, 0.1, size=weight_ih.shape).astype('float64')
+            param_list[weight_ih_name].set_value(weight_ih)
+            named_param_list[weight_ih_name].set_value(weight_ih)
+
+            bias_ih = param_list[bias_ih_name].numpy()
+            bias_ih = np.random.uniform(
+                -0.1, 0.1, size=bias_ih.shape).astype('float64')
+            param_list[bias_ih_name].set_value(bias_ih)
+            named_param_list[bias_ih_name].set_value(bias_ih)
+
+            weight_hh = param_list[weight_hh_name].numpy()
+            weight_hh = np.random.uniform(
+                -0.1, 0.1, size=weight_hh.shape).astype('float64')
+            param_list[weight_hh_name].set_value(weight_hh)
+            named_param_list[weight_hh_name].set_value(weight_hh)
+
+            bias_hh = param_list[bias_hh_name].numpy()
+            bias_hh = np.random.uniform(
+                -0.1, 0.1, size=bias_hh.shape).astype('float64')
+            param_list[bias_hh_name].set_value(bias_hh)
+            named_param_list[bias_hh_name].set_value(bias_hh)
+
+            step_input_np = np.random.uniform(-0.1, 0.1, (
+                self.batch_size, self.input_size)).astype('float64')
+            pre_hidden_np = np.random.uniform(-0.1, 0.1, (
+                self.batch_size, self.hidden_size)).astype('float64')
+
+            step_input_var = fluid.dygraph.to_variable(step_input_np)
+            pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
+            api_out = cudnn_gru(step_input_var, pre_hidden_var)
+            named_api_out = named_cudnn_gru(step_input_var, pre_hidden_var)
+
+        np_out = cudnn_step(step_input_np, pre_hidden_np, weight_ih, bias_ih,
+                            weight_hh, bias_hh)
+
+        self.assertTrue(np.allclose(api_out.numpy(), np_out, rtol=1e-5, atol=0))
+        self.assertTrue(
+            np.allclose(
+                named_api_out.numpy(), np_out, rtol=1e-5, atol=0))
+
+
+class TestNonCudnnGRU(unittest.TestCase):
+    def setUp(self):
+        self.input_size = 100
+        self.hidden_size = 200
+        self.batch_size = 64
+
+    def test_run(self):
+        """Compare unnamed and named non-cuDNN GRUCells with the NumPy step."""
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+
+        with fluid.dygraph.guard(place):
+            param_attr = fluid.ParamAttr(name="param_attr")
+            bias_attr = fluid.ParamAttr(name="bias_attr")
+            named_non_cudnn_gru = GRUCell(
+                self.hidden_size,
+                self.input_size,
+                param_attr,
+                bias_attr,
+                use_cudnn_impl=False)
+            non_cudnn_gru = GRUCell(
+                self.hidden_size, self.input_size, use_cudnn_impl=False)
+
+            param_list = non_cudnn_gru.state_dict()
+            named_param_list = named_non_cudnn_gru.state_dict()
+
+            # process weight and bias
+
+            gate_w_name = "_gate_weight"
+            gate_b_name = "_gate_bias"
+            candidate_w_name = "_candidate_weight"
+            candidate_b_name = "_candidate_bias"
+
+            gate_w = param_list[gate_w_name].numpy()
+            gate_w = np.random.uniform(
+                -0.1, 0.1, size=gate_w.shape).astype('float64')
+            param_list[gate_w_name].set_value(gate_w)
+            named_param_list[gate_w_name].set_value(gate_w)
+
+            gate_b = param_list[gate_b_name].numpy()
+            gate_b = np.random.uniform(
+                -0.1, 0.1, size=gate_b.shape).astype('float64')
+            param_list[gate_b_name].set_value(gate_b)
+            named_param_list[gate_b_name].set_value(gate_b)
+
+            candidate_w = param_list[candidate_w_name].numpy()
+            candidate_w = np.random.uniform(
+                -0.1, 0.1, size=candidate_w.shape).astype('float64')
+            param_list[candidate_w_name].set_value(candidate_w)
+            named_param_list[candidate_w_name].set_value(candidate_w)
+
+            candidate_b = param_list[candidate_b_name].numpy()
+            candidate_b = np.random.uniform(
+                -0.1, 0.1, size=candidate_b.shape).astype('float64')
+            param_list[candidate_b_name].set_value(candidate_b)
+            named_param_list[candidate_b_name].set_value(candidate_b)
+
+            step_input_np = np.random.uniform(-0.1, 0.1, (
+                self.batch_size, self.input_size)).astype('float64')
+            pre_hidden_np = np.random.uniform(-0.1, 0.1, (
+                self.batch_size, self.hidden_size)).astype('float64')
+
+            step_input_var = fluid.dygraph.to_variable(step_input_np)
+            pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
+            api_out = non_cudnn_gru(step_input_var, pre_hidden_var)
+            named_api_out = named_non_cudnn_gru(step_input_var, pre_hidden_var)
+
+        np_out = non_cudnn_step(step_input_np, pre_hidden_np, gate_w, gate_b,
+                                candidate_w, candidate_b)
+
+        self.assertTrue(np.allclose(api_out.numpy(), np_out, rtol=1e-5, atol=0))
+        self.assertTrue(
+            np.allclose(
+                named_api_out.numpy(), np_out, rtol=1e-5, atol=0))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cudnn_lstmcell.py b/python/paddle/fluid/tests/unittests/test_cudnn_lstmcell.py
new file mode 100644
index 0000000000..69718f304b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_cudnn_lstmcell.py
@@ -0,0 +1,254 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.dygraph.rnn import LSTMCell
+
+import numpy as np
+
+np.random.seed(123)
+
+
+def sigmoid(x):
+    return 1. / (1. + np.exp(-x))
+
+
+def tanh(x):
+    return 2. * sigmoid(2. * x) - 1.
+
+
+def non_cudnn_step(step_in, pre_hidden, pre_cell, gate_w, gate_b,
+                   forget_bias=1.0):
+    """NumPy reference for LSTMCell(use_cudnn_impl=False)."""
+    concat_1 = np.concatenate([step_in, pre_hidden], 1)
+    gate_input = np.matmul(concat_1, gate_w)
+    gate_input += gate_b
+    i, j, f, o = np.split(gate_input, indices_or_sections=4, axis=1)
+    new_cell = pre_cell * sigmoid(f + forget_bias) + sigmoid(i) * tanh(j)
+    new_hidden = tanh(new_cell) * sigmoid(o)
+
+    return new_hidden, new_cell
+
+
+def cudnn_step(step_input_np, pre_hidden_np, pre_cell_np, weight_ih, bias_ih,
+               weight_hh, bias_hh):
+    """NumPy reference for the default (cuDNN-compatible) LSTMCell."""
+    igates = np.matmul(step_input_np, weight_ih)
+    igates = igates + bias_ih
+    hgates = np.matmul(pre_hidden_np, weight_hh)
+    hgates = hgates + bias_hh
+
+    chunked_igates = np.split(igates, indices_or_sections=4, axis=1)
+    chunked_hgates = np.split(hgates, indices_or_sections=4, axis=1)
+
+    ingate = chunked_igates[0] + chunked_hgates[0]
+    ingate = sigmoid(ingate)
+
+    forgetgate = chunked_igates[1] + chunked_hgates[1]
+    forgetgate = sigmoid(forgetgate)
+
+    cellgate = chunked_igates[2] + chunked_hgates[2]
+    cellgate = tanh(cellgate)
+
+    outgate = chunked_igates[3] + chunked_hgates[3]
+    outgate = sigmoid(outgate)
+
+    new_cell = (forgetgate * pre_cell_np) + (ingate * cellgate)
+    new_hidden = outgate * tanh(new_cell)
+
+    return new_hidden, new_cell
+
+
+class TestCudnnLSTM(unittest.TestCase):
+    def setUp(self):
+        self.input_size = 100
+        self.hidden_size = 200
+        self.batch_size = 128
+
+    def test_run(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+
+        with fluid.dygraph.guard(place):
+            param_attr = fluid.ParamAttr(name="param_attr")
+            bias_attr = fluid.ParamAttr(name="bias_attr")
+            named_cudnn_lstm = LSTMCell(self.hidden_size, self.input_size,
+                                        param_attr, bias_attr)
+            cudnn_lstm = LSTMCell(self.hidden_size, self.input_size)
+
+            param_list = cudnn_lstm.state_dict()
+            named_param_list = named_cudnn_lstm.state_dict()
+
+            # process weight and bias
+
+            weight_ih_name = "_weight_ih"
+            bias_ih_name = "_bias_ih"
+            weight_hh_name = "_weight_hh"
+            bias_hh_name = "_bias_hh"
+
+            weight_ih = param_list[weight_ih_name].numpy()
+            weight_ih = np.random.uniform(
+                -0.1, 0.1, size=weight_ih.shape).astype('float64')
+            param_list[weight_ih_name].set_value(weight_ih)
+            named_param_list[weight_ih_name].set_value(weight_ih)
+
+            bias_ih = param_list[bias_ih_name].numpy()
+            bias_ih = np.random.uniform(
+                -0.1, 0.1, size=bias_ih.shape).astype('float64')
+            param_list[bias_ih_name].set_value(bias_ih)
+            named_param_list[bias_ih_name].set_value(bias_ih)
+
+            weight_hh = param_list[weight_hh_name].numpy()
+            weight_hh = np.random.uniform(
+                -0.1, 0.1, size=weight_hh.shape).astype('float64')
+            param_list[weight_hh_name].set_value(weight_hh)
+            named_param_list[weight_hh_name].set_value(weight_hh)
+
+            bias_hh = param_list[bias_hh_name].numpy()
+            bias_hh = np.random.uniform(
+                -0.1, 0.1, size=bias_hh.shape).astype('float64')
+            param_list[bias_hh_name].set_value(bias_hh)
+            named_param_list[bias_hh_name].set_value(bias_hh)
+
+            step_input_np = np.random.uniform(-0.1, 0.1, (
+                self.batch_size, self.input_size)).astype('float64')
+            pre_hidden_np = np.random.uniform(-0.1, 0.1, (
+                self.batch_size, self.hidden_size)).astype('float64')
+            pre_cell_np = np.random.uniform(-0.1, 0.1, (
+                self.batch_size, self.hidden_size)).astype('float64')
+
+            step_input_var = fluid.dygraph.to_variable(step_input_np)
+            pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
+            pre_cell_var = fluid.dygraph.to_variable(pre_cell_np)
+            api_out = cudnn_lstm(step_input_var, pre_hidden_var, pre_cell_var)
+            named_api_out = named_cudnn_lstm(step_input_var, pre_hidden_var,
+                                             pre_cell_var)
+
+            api_hidden_out = api_out[0]
+            api_cell_out = api_out[1]
+            named_api_hidden_out = named_api_out[0]
+            named_api_cell_out = named_api_out[1]
+
+            np_hidden_out, np_cell_out = cudnn_step(
+                step_input_np, pre_hidden_np, pre_cell_np, weight_ih, bias_ih,
+                weight_hh, bias_hh)
+
+            self.assertTrue(
+                np.allclose(
+                    api_hidden_out.numpy(), np_hidden_out, rtol=1e-5, atol=0))
+            self.assertTrue(
+                np.allclose(
+                    api_cell_out.numpy(), np_cell_out, rtol=1e-5, atol=0))
+            self.assertTrue(
+                np.allclose(
+                    named_api_hidden_out.numpy(),
+                    np_hidden_out,
+                    rtol=1e-5,
+                    atol=0))
+            self.assertTrue(
+                np.allclose(
+                    named_api_cell_out.numpy(), np_cell_out, rtol=1e-5, atol=0))
+
+
+class TestNonCudnnLSTM(unittest.TestCase):
+    def setUp(self):
+        self.input_size = 100
+        self.hidden_size = 200
+        self.batch_size = 128
+
+    def test_run(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+
+        with fluid.dygraph.guard(place):
+            param_attr = fluid.ParamAttr(name="param_attr")
+            bias_attr = fluid.ParamAttr(name="bias_attr")
+            named_cudnn_lstm = LSTMCell(
+                self.hidden_size,
+                self.input_size,
+                param_attr,
+                bias_attr,
+                use_cudnn_impl=False)
+            cudnn_lstm = LSTMCell(
+                self.hidden_size, self.input_size, use_cudnn_impl=False)
+
+            param_list = cudnn_lstm.state_dict()
+            named_param_list = named_cudnn_lstm.state_dict()
+
+            # process weight and bias
+
+            gate_w_name = "_weight"
+            gate_b_name = "_bias"
+
+            gate_w = param_list[gate_w_name].numpy()
+            gate_w = np.random.uniform(
+                -0.1, 0.1, size=gate_w.shape).astype('float64')
+            param_list[gate_w_name].set_value(gate_w)
+            named_param_list[gate_w_name].set_value(gate_w)
+
+            gate_b = param_list[gate_b_name].numpy()
+            gate_b = np.random.uniform(
+                -0.1, 0.1, size=gate_b.shape).astype('float64')
+            param_list[gate_b_name].set_value(gate_b)
+            named_param_list[gate_b_name].set_value(gate_b)
+
+            step_input_np = np.random.uniform(-0.1, 0.1, (
+                self.batch_size, self.input_size)).astype('float64')
+            pre_hidden_np = np.random.uniform(-0.1, 0.1, (
+                self.batch_size, self.hidden_size)).astype('float64')
+            pre_cell_np = np.random.uniform(-0.1, 0.1, (
+                self.batch_size, self.hidden_size)).astype('float64')
+
+            step_input_var = fluid.dygraph.to_variable(step_input_np)
+            pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np)
+            pre_cell_var = fluid.dygraph.to_variable(pre_cell_np)
+            api_out = cudnn_lstm(step_input_var, pre_hidden_var, pre_cell_var)
+            named_api_out = named_cudnn_lstm(step_input_var, pre_hidden_var,
+                                             pre_cell_var)
+
+            api_hidden_out = api_out[0]
+            api_cell_out = api_out[1]
+            named_api_hidden_out = named_api_out[0]
+            named_api_cell_out = named_api_out[1]
+
+            np_hidden_out, np_cell_out = non_cudnn_step(
+                step_input_np, pre_hidden_np, pre_cell_np, gate_w, gate_b)
+
+            self.assertTrue(
+                np.allclose(
+                    api_hidden_out.numpy(), np_hidden_out, rtol=1e-5, atol=0))
+            self.assertTrue(
+                np.allclose(
+                    api_cell_out.numpy(), np_cell_out, rtol=1e-5, atol=0))
+            self.assertTrue(
+                np.allclose(
+                    named_api_hidden_out.numpy(),
+                    np_hidden_out,
+                    rtol=1e-5,
+                    atol=0))
+            self.assertTrue(
+                np.allclose(
+                    named_api_cell_out.numpy(), np_cell_out, rtol=1e-5, atol=0))
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab