From 3ba1237e5f46567faa6853485a0776f49865ae35 Mon Sep 17 00:00:00 2001 From: wangzhen38 <41941775+wangzhen38@users.noreply.github.com> Date: Tue, 6 Dec 2022 09:57:04 +0800 Subject: [PATCH] [remove fluid] GRUUnit NCE (#48610) * [remove fluid] GRUUnit NCE * [remove fluid] GRUUnit NCE * [remove fluid] GRUUnit NCE * [remove fluid] GRUUnit NCE --- python/paddle/fluid/dygraph/nn.py | 480 ------------------ .../unittests/dygraph_to_static/test_lac.py | 10 +- .../fluid/tests/unittests/test_gru_unit_op.py | 16 - .../test_imperative_load_static_param.py | 5 +- .../test_imperative_ocr_attention_model.py | 18 +- .../fluid/tests/unittests/test_layers.py | 471 ----------------- .../paddle/fluid/tests/unittests/test_nce.py | 53 -- 7 files changed, 10 insertions(+), 1043 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index abef927af8..39da342c38 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -54,8 +54,6 @@ __all__ = [ 'Linear', 'BatchNorm', 'Embedding', - 'GRUUnit', - 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', @@ -1363,484 +1361,6 @@ class Embedding(layers.Layer): return out -class GRUUnit(layers.Layer): - """ - **GRU unit layer** - - It creates a callable object from GRUUnit class. - If origin_mode is True, then the equation of a gru step is from paper - `Learning Phrase Representations using RNN Encoder-Decoder for Statistical - Machine Translation `_ - - .. math:: - u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u) - - r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r) - - m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m) - - h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t) - - If origin_mode is False, then the equation of a gru step is from paper - `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence - Modeling `_ - - .. math:: - u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u) - - r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r) - - m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m) - - h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t) - - - The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms - of the equation above, the :math:`z_t` is split into 3 parts - - :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to - implement a full GRU unit operator for an input, a fully - connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`. - - The terms :math:`u_t` and :math:`r_t` represent the update and reset gates - of the GRU cell. Unlike LSTM, GRU has one lesser gate. However, there is - an intermediate candidate hidden output, which is denoted by :math:`m_t`. - This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})` - and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`. - - Parameters: - size (int): The input dimension value. - param_attr(ParamAttr, optional): The parameter attribute for the learnable - hidden-hidden weight matrix. - - **Note**: - - 1. The shape of the weight matrix is :math:`[T, 3*D]`, where D is the hidden size. - 2. All elements in the weight matrix can be divided into two parts. The first - part are weights of the update gate and reset gate with shape :math:`[D, 2*D]`, - and the second part are weights for candidate hidden state with shape :math:`[D, D]`. - - - If it is set to None or one attribute of ParamAttr, gru_unit will - create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. The default - value is None. - bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias - of GRU.Note that the bias with :math:`[1, 3*D]` concatenates - the bias in the update gate, reset gate and candidate calculations. - If it is set to False, no bias will be applied to the update gate, - reset gate and candidate calculations. If it is set to None or one - attribute of ParamAttr, gru_unit will create ParamAttr as - bias_attr. If the Initializer of the bias_attr is not set, the bias - is initialized zero. The default value is None. - activation (str): The activation type for cell (actNode). - The default value is 'tanh'. - gate_activation (str): The activation type for gates (actGate). - The default value is 'sigmoid'. - dtype(str): The dtype of the layers. The data type can be set as - 'float32', 'float64'. The default value is 'float32'. - - Attribute: - **weight** (Parameter): the learnable weights of this layer. - - **bias** (Parameter): the learnable bias of this layer. - - Returns: - tuple: The hidden value, reset-hidden value and gate values. The hidden value - is a 2-D tensor with shape :math:`[T, D]` . The reset-hidden value is a - 2-D tensor with shape :math:`[T, D]` . The gate value is a 2-D tensor with - shape :math:`[T, 3*D]`. - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import paddle.fluid.dygraph.base as base - import numpy - - lod = [[2, 4, 3]] - D = 5 - T = sum(lod[0]) - - input = numpy.random.rand(T, 3 * D).astype('float32') - hidden_input = numpy.random.rand(T, D).astype('float32') - with fluid.dygraph.guard(): - x = numpy.random.random((3, 32, 32)).astype('float32') - gru = fluid.dygraph.GRUUnit(size=D * 3) - dy_ret = gru( - base.to_variable(input), base.to_variable(hidden_input)) - - """ - - def __init__( - self, - size, - param_attr=None, - bias_attr=None, - activation='tanh', - gate_activation='sigmoid', - origin_mode=False, - dtype='float32', - ): - super().__init__() - self._bias_attr = bias_attr - activation_dict = dict( - identity=0, - sigmoid=1, - tanh=2, - relu=3, - ) - self.activation = activation_dict[activation] - self.gate_activation = activation_dict[gate_activation] - - self._dtype = dtype - size = size // 3 - # create weight - self.weight = self.create_parameter( - attr=param_attr, shape=[size, 3 * size], dtype=dtype - ) - - # create bias - bias_size = [1, 3 * size] - self._bias_size = bias_size - self.bias = self.create_parameter( - attr=bias_attr, shape=bias_size, dtype=dtype, is_bias=True - ) - - def forward(self, input, hidden): - if _non_static_mode(): - gate, reset_hidden_pre, updated_hidden = _legacy_C_ops.gru_unit( - input, - hidden, - self.weight, - self.bias, - 'activation', - self.activation, - 'gate_activation', - self.gate_activation, - ) - return updated_hidden, reset_hidden_pre, gate - - check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'GRUUnit' - ) - check_variable_and_dtype( - hidden, 'hidden', ['float32', 'float64'], 'GRUUnit' - ) - inputs = { - 'Input': [input], - 'HiddenPrev': [hidden], - 'Weight': [self.weight], - } - if self.bias is not None: - inputs['Bias'] = [self.bias] - gate = self._helper.create_variable_for_type_inference(self._dtype) - reset_hidden_pre = self._helper.create_variable_for_type_inference( - self._dtype - ) - updated_hidden = self._helper.create_variable_for_type_inference( - self._dtype - ) - self._helper.append_op( - type='gru_unit', - inputs=inputs, - outputs={ - 'Gate': gate, - 'ResetHiddenPrev': reset_hidden_pre, - 'Hidden': updated_hidden, - }, - attrs={ - 'activation': self.activation, - 'gate_activation': self.gate_activation, - }, - ) - - return updated_hidden, reset_hidden_pre, gate - - -class NCE(layers.Layer): - """ - This interface is used to construct a callable object of the ``NCE`` class. - For more details, refer to code examples. - It implements the function of the ``NCE`` loss function. - By default this function uses a uniform distribution for sampling, and it - compute and return the noise-contrastive estimation training loss. See - `Noise-contrastive estimation: A new estimation principle for unnormalized statistical models `_ . - - Parameters: - num_total_classes (int): Total number of classes in all samples. - dim (int): Dimension of input (possibly embedding dim). - param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) - of nce. If it is set to None or one attribute of ParamAttr, nce - will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr or bool, optional): The attribute for the bias of nce. - If it is set to False, no bias will be added to the output units. - If it is set to None or one attribute of ParamAttr, nce - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - num_neg_samples (int, optional): The number of negative classes. The default value is 10. - sampler (str, optional): The sampler used to sample class from negative classes. - It can be 'uniform', 'log_uniform' or 'custom_dist'. - default: 'uniform'. - custom_dist (float[], optional): A float[] with size=num_total_classes. - It is used when sampler is set to 'custom_dist'. - custom_dist[i] is the probability of i-th class to be sampled. - Default: None. - seed (int, optional): The seed used in sampler. Default: 0. - is_sparse(bool, optional): The flag indicating whether to use sparse update. If is_sparse is True, the weight@GRAD and bias@GRAD will be changed to SelectedRows. Default: False. - dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32". - - Attribute: - **weight** (Parameter): the learnable weights of this layer. - - **bias** (Parameter or None): the learnable bias of this layer. - - Returns: - None - - Examples: - .. code-block:: python - - import numpy as np - import paddle.fluid as fluid - - window_size = 5 - dict_size = 20 - label_word = int(window_size // 2) + 1 - inp_word = np.array([[1], [2], [3], [4], [5]]).astype('int64') - nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32') - - with fluid.dygraph.guard(): - words = [] - for i in range(window_size): - words.append(fluid.dygraph.base.to_variable(inp_word[i])) - - emb = fluid.Embedding( - size=[dict_size, 32], - param_attr='emb.w', - is_sparse=False) - - embs3 = [] - for i in range(window_size): - if i == label_word: - continue - - emb_rlt = emb(words[i]) - embs3.append(emb_rlt) - - embs3 = fluid.layers.concat(input=embs3, axis=1) - nce = fluid.NCE( - num_total_classes=dict_size, - dim=embs3.shape[1], - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=1, - param_attr='nce.w', - bias_attr='nce.b') - - wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) - nce_loss3 = nce(embs3, wl) - - """ - - def __init__( - self, - num_total_classes, - dim, - sample_weight=None, - param_attr=None, - bias_attr=None, - num_neg_samples=None, - sampler="uniform", - custom_dist=None, - seed=0, - is_sparse=False, - dtype='float32', - ): - super().__init__() - self._param_attr = param_attr - self._bias_attr = bias_attr - self._num_total_classes = num_total_classes - self._dtype = dtype - self._inputs = dict() - self._inputs['SampleWeight'] = ( - sample_weight if sample_weight is not None else [] - ) - if sampler == "uniform": - sampler = 0 - elif sampler == "log_uniform": - sampler = 1 - elif sampler == "custom_dist": - assert custom_dist is not None - # assert isinstance(custom_dist, Variable) - - custom_dist_len = len(custom_dist) - alias_probs_ = [0] * custom_dist_len - alias_ = [0] * custom_dist_len - bigs = [] - littles = [] - for i in range(custom_dist_len): - normal_prob = custom_dist[i] * custom_dist_len - if normal_prob - 1.0 > 0: - bigs.append((i, normal_prob)) - elif 1.0 - normal_prob > 0: - littles.append((i, normal_prob)) - else: - alias_probs_[i] = normal_prob - alias_[i] = -1 - - while len(bigs) and len(littles): - big = bigs.pop(0) - little = littles.pop(0) - - big_idx = big[0] - big_prob = big[1] - - alias_probs_[little[0]] = little[1] - alias_[little[0]] = big_idx - big_left = big[1] + little[1] - 1 - if big_left - 1.0 > 0: - bigs.append((big_idx, big_left)) - elif 1.0 - big_left > 0: - littles.append((big_idx, big_left)) - else: - alias_probs_[big_idx] = big_left - alias_[big_idx] = -1 - - if len(bigs): - big = bigs.pop(0) - alias_probs_[big[0]] = 1.0 - alias_[big[0]] = -1 - if len(littles): - little = littles.pop(0) - alias_probs_[little[0]] = 1.0 - alias_[little[0]] = -1 - - def _init_by_numpy_array(numpy_array): - ret = self.create_parameter( - attr=ParamAttr(), - shape=numpy_array.shape, - dtype=numpy_array.dtype, - default_initializer=NumpyArrayInitializer(numpy_array), - ) - ret.stop_gradient = True - return ret - - self._inputs['CustomDistProbs'] = _init_by_numpy_array( - np.array(custom_dist).astype('float32') - ) - self._inputs['CustomDistAlias'] = _init_by_numpy_array( - np.array(alias_).astype('int32') - ) - self._inputs['CustomDistAliasProbs'] = _init_by_numpy_array( - np.array(alias_probs_).astype('float32') - ) - sampler = 2 - else: - raise Exception("Unsupported sampler type.") - - if num_neg_samples is None: - num_neg_samples = 10 - else: - num_neg_samples = int(num_neg_samples) - self._num_neg_samples = num_neg_samples - remote_prefetch = is_sparse - print( - "With sparse mode, if your models has only small parameter prefetch may cause speed down" - ) - self._attrs = { - 'num_total_classes': int(num_total_classes), - 'num_neg_samples': num_neg_samples, - 'seed': seed, - 'sampler': sampler, - 'is_sparse': is_sparse, - 'remote_prefetch': remote_prefetch, - } - - self.weight = self.create_parameter( - attr=self._param_attr, - shape=[self._num_total_classes, dim], - is_bias=False, - dtype=self._dtype, - ) - if self._bias_attr: - self.bias = self.create_parameter( - attr=self._bias_attr, - shape=[self._num_total_classes, 1], - is_bias=True, - dtype=self._dtype, - ) - self._inputs['Bias'] = self.bias - self._inputs['Weight'] = self.weight - - def forward(self, input, label, sample_weight=None): - if _non_static_mode(): - attrs = ( - 'num_total_classes', - self._attrs['num_total_classes'], - 'num_neg_samples', - self._attrs['num_neg_samples'], - 'seed', - self._attrs['seed'], - 'sampler', - self._attrs['sampler'], - 'is_sparse', - self._attrs['is_sparse'], - 'remote_prefetch', - self._attrs['remote_prefetch'], - ) - cost, _, _ = _legacy_C_ops.nce( - input, - label, - self.weight, - self.bias, - self._inputs['SampleWeight'], - self._inputs['CustomDistProbs'], - self._inputs['CustomDistAlias'], - self._inputs['CustomDistAliasProbs'], - *attrs - ) - return cost / (self._num_neg_samples + 1) - - check_variable_and_dtype(input, "input", ['float32', 'float64'], "NCE") - check_variable_and_dtype(label, "label", ['int64'], "NCE") - check_type( - sample_weight, 'sample_weight', (Variable, type(None)), 'NCE' - ) - assert isinstance(input, Variable) - assert isinstance(label, Variable) - - self._inputs['Input'] = input - self._inputs['Label'] = label - self._inputs['SampleWeight'] = ( - sample_weight if sample_weight is not None else [] - ) - - cost = self._helper.create_variable_for_type_inference( - dtype=input.dtype - ) - sample_logits = self._helper.create_variable_for_type_inference( - dtype=input.dtype - ) - sample_labels = self._helper.create_variable_for_type_inference( - dtype=label.dtype - ) - - self._helper.append_op( - type='nce', - inputs=self._inputs, - outputs={ - 'Cost': cost, - 'SampleLogits': sample_logits, - 'SampleLabels': sample_labels, - }, - attrs=self._attrs, - ) - return cost / (self._num_neg_samples + 1) - - class PRelu(layers.Layer): r""" This interface is used to construct a callable object of the ``PRelu`` class. diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index 5aff8c710a..dc1eedcc8d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -25,7 +25,7 @@ os.environ["CUDA_VISIBLE_DEVICES"] = "2" import paddle import paddle.fluid as fluid from paddle import _legacy_C_ops -from paddle.fluid.dygraph import Embedding, GRUUnit, to_variable +from paddle.fluid.dygraph import Embedding, to_variable from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.framework import _non_static_mode from paddle.jit import ProgramTranslator @@ -57,13 +57,9 @@ class DynamicGRU(fluid.dygraph.Layer): ): super().__init__() - self.gru_unit = GRUUnit( + self.gru_unit = paddle.nn.GRUCell( size * 3, - param_attr=param_attr, - bias_attr=bias_attr, - activation=candidate_activation, - gate_activation=gate_activation, - origin_mode=origin_mode, + size, ) self.size = size diff --git a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py index 92fadf591a..edb8c83ced 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py @@ -23,22 +23,6 @@ from paddle.fluid.framework import Program, program_guard from paddle.fluid.layers import gru_unit -class TestGRUUnitAPIError(unittest.TestCase): - def test_errors(self): - with fluid.program_guard(fluid.Program(), fluid.Program()): - D = 5 - layer = fluid.dygraph.nn.GRUUnit(size=D * 3) - # the input must be Variable. - x0 = fluid.create_lod_tensor( - np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace() - ) - self.assertRaises(TypeError, layer, x0) - # the input dtype must be float32 or float64 - x = fluid.data(name='x', shape=[-1, D * 3], dtype='float16') - hidden = fluid.data(name='hidden', shape=[-1, D], dtype='float32') - self.assertRaises(TypeError, layer, x, hidden) - - class GRUActivationType(OpTest): identity = 0 sigmoid = 1 diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py index a98d9b994b..573c1699ac 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py @@ -21,7 +21,7 @@ import numpy as np import paddle import paddle.fluid as fluid import paddle.fluid.framework as framework -from paddle.fluid.dygraph.nn import NCE, BatchNorm, Embedding, GroupNorm, PRelu +from paddle.fluid.dygraph.nn import BatchNorm, Embedding, GroupNorm, PRelu from paddle.nn import Linear @@ -212,9 +212,6 @@ class TestDygraphLoadStatic(unittest.TestCase): self.layer_norm_1 = paddle.nn.LayerNorm([10]) self.layer_norm_2 = paddle.nn.LayerNorm(10) - self.nce1 = NCE(10000, 100) - self.nce2 = NCE(10000, 100) - self.prelu1 = PRelu("channel", channel=5) self.prelu2 = PRelu("channel", channel=5) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 998e9de7ff..4e4c8aa435 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -21,7 +21,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import BatchNorm, Embedding, GRUUnit +from paddle.fluid.dygraph.nn import BatchNorm, Embedding from paddle.fluid.framework import _test_eager_guard from paddle.nn import Linear @@ -168,13 +168,9 @@ class DynamicGRU(fluid.dygraph.Layer): ): super().__init__() - self.gru_unit = GRUUnit( + self.gru_unit = paddle.nn.GRUCell( size * 3, - param_attr=param_attr, - bias_attr=bias_attr, - activation=candidate_activation, - gate_activation=gate_activation, - origin_mode=origin_mode, + size, ) self.h_0 = h_0 @@ -189,7 +185,7 @@ class DynamicGRU(fluid.dygraph.Layer): i = inputs.shape[1] - 1 - i input_ = paddle.slice(inputs, axes=[1], starts=[i], ends=[i + 1]) input_ = paddle.reshape(input_, [-1, input_.shape[2]]) - hidden, reset, gate = self.gru_unit(input_, hidden) + hidden, reset = self.gru_unit(input_, hidden) hidden_ = paddle.reshape(hidden, [-1, 1, hidden.shape[1]]) if self.is_reverse: res = [hidden_] + res @@ -330,9 +326,7 @@ class GRUDecoderWithAttention(fluid.dygraph.Layer): self.fc_2_layer = Linear( decoder_size, decoder_size * 3, bias_attr=False ) - self.gru_unit = GRUUnit( - size=decoder_size * 3, param_attr=None, bias_attr=None - ) + self.gru_unit = paddle.nn.GRUCell(decoder_size * 3, decoder_size) self.out_layer = Linear(decoder_size, num_classes + 2, bias_attr=None) self.decoder_size = decoder_size @@ -357,7 +351,7 @@ class GRUDecoderWithAttention(fluid.dygraph.Layer): fc_2 = self.fc_2_layer(current_word) decoder_inputs = paddle.add(x=fc_1, y=fc_2) - h, _, _ = self.gru_unit(decoder_inputs, hidden_mem) + h, _ = self.gru_unit(decoder_inputs, hidden_mem) hidden_mem = h out = self.out_layer(h) out = paddle.nn.functional.softmax(out) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index dcf442200d..dcf9d4d100 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -316,147 +316,6 @@ class TestLayer(LayerTest): np.testing.assert_allclose(static_ret, dy_ret_value, rtol=1e-05) np.testing.assert_allclose(static_ret, dy_eager_ret_value, rtol=1e-05) - def test_gru_unit(self): - lod = [[2, 4, 3]] - D = 5 - T = sum(lod[0]) - N = len(lod[0]) - - input = np.random.rand(T, 3 * D).astype('float32') - hidden_input = np.random.rand(T, D).astype('float32') - - with self.static_graph(): - x = layers.data(name='x', shape=[-1, D * 3], dtype='float32') - hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32') - updated_hidden, reset_hidden_pre, gate = layers.gru_unit( - input=x, hidden=hidden, size=D * 3 - ) - static_ret = self.get_static_graph_result( - feed={'x': input, 'hidden': hidden_input}, - fetch_list=[updated_hidden, reset_hidden_pre, gate], - ) - - with self.static_graph(): - x = layers.data(name='x', shape=[-1, D * 3], dtype='float32') - hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32') - updated_hidden, reset_hidden_pre, gate = layers.gru_unit( - input=x, hidden=hidden, size=D * 3 - ) - gru = nn.GRUUnit(size=D * 3) - updated_hidden, reset_hidden_pre, gate = gru(x, hidden) - - static_ret2 = self.get_static_graph_result( - feed={'x': input, 'hidden': hidden_input}, - fetch_list=[updated_hidden, reset_hidden_pre, gate], - ) - - with self.dynamic_graph(): - with _test_eager_guard(): - gru = nn.GRUUnit(size=D * 3) - dy_eager_ret = gru( - base.to_variable(input), base.to_variable(hidden_input) - ) - dy_eager_ret_value = [] - for i in range(len(static_ret)): - dy_eager_ret_value.append(dy_eager_ret[i].numpy()) - - gru = nn.GRUUnit(size=D * 3) - dy_ret = gru( - base.to_variable(input), base.to_variable(hidden_input) - ) - dy_ret_value = [] - for i in range(len(static_ret)): - dy_ret_value.append(dy_ret[i].numpy()) - - for i in range(len(static_ret)): - np.testing.assert_allclose( - static_ret[i], static_ret2[i], rtol=1e-05 - ) - np.testing.assert_allclose( - static_ret[i], dy_ret_value[i], rtol=1e-05 - ) - np.testing.assert_allclose( - static_ret[i], dy_eager_ret_value[i], rtol=1e-05 - ) - - with self.dynamic_graph(): - with _test_eager_guard(): - custom_weight = np.random.randn(D, D * 3).astype("float32") - weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) - ) - gru1 = nn.GRUUnit(size=D * 3) - gru2 = nn.GRUUnit(size=D * 3, param_attr=weight_attr) - dy_ret1 = gru1( - base.to_variable(input), base.to_variable(hidden_input) - ) - dy_ret2 = gru2( - base.to_variable(input), base.to_variable(hidden_input) - ) - self.assertFalse( - np.array_equal(gru1.weight.numpy(), gru2.weight.numpy()) - ) - for o1, o2 in zip(dy_ret1, dy_ret2): - self.assertFalse(np.array_equal(o1.numpy(), o2.numpy())) - gru2.weight.set_value(gru1.weight.numpy()) - gru2.bias.set_value(gru1.bias) - dy_ret1 = gru1( - base.to_variable(input), base.to_variable(hidden_input) - ) - dy_ret2 = gru2( - base.to_variable(input), base.to_variable(hidden_input) - ) - for o1, o2 in zip(dy_ret1, dy_ret2): - np.testing.assert_array_equal(o1.numpy(), o2.numpy()) - - gru2.weight = gru1.weight - gru2.bias = gru1.bias - np.testing.assert_array_equal( - gru1.weight.numpy(), gru2.weight.numpy() - ) - np.testing.assert_array_equal( - gru1.bias.numpy(), gru2.bias.numpy() - ) - - custom_weight = np.random.randn(D, D * 3).astype("float32") - weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) - ) - gru1 = nn.GRUUnit(size=D * 3) - gru2 = nn.GRUUnit(size=D * 3, param_attr=weight_attr) - dy_ret1 = gru1( - base.to_variable(input), base.to_variable(hidden_input) - ) - dy_ret2 = gru2( - base.to_variable(input), base.to_variable(hidden_input) - ) - self.assertFalse( - np.array_equal(gru1.weight.numpy(), gru2.weight.numpy()) - ) - for o1, o2 in zip(dy_ret1, dy_ret2): - self.assertFalse(np.array_equal(o1.numpy(), o2.numpy())) - gru2.weight.set_value(gru1.weight.numpy()) - gru2.bias.set_value(gru1.bias) - dy_ret1 = gru1( - base.to_variable(input), base.to_variable(hidden_input) - ) - dy_ret2 = gru2( - base.to_variable(input), base.to_variable(hidden_input) - ) - for o1, o2 in zip(dy_ret1, dy_ret2): - np.testing.assert_array_equal(o1.numpy(), o2.numpy()) - - gru2.weight = gru1.weight - gru2.bias = gru1.bias - np.testing.assert_array_equal( - gru1.weight.numpy(), gru2.weight.numpy() - ) - np.testing.assert_array_equal(gru1.bias.numpy(), gru2.bias.numpy()) - def test_elementwise_math(self): n = np.ones([3, 3], dtype='float32') n2 = np.ones([3, 3], dtype='float32') * 1.1 @@ -1139,336 +998,6 @@ class TestLayer(LayerTest): emb1.weight.numpy(), emb2.weight.numpy() ) - def test_nce(self): - window_size = 5 - dict_size = 20 - label_word = int(window_size // 2) + 1 - inp_word = np.array([[1], [2], [3], [4], [5]]).astype('int64') - nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32') - seed = 1 - with self.static_graph(): - words = [] - for i in range(window_size): - words.append( - layers.data( - name='word_{0}'.format(i), shape=[None], dtype='int64' - ) - ) - sample_weights = layers.fill_constant( - shape=[5, 1], dtype='float32', value=1 - ) - embs = [] - for i in range(window_size): - if i == label_word: - continue - - emb = fluid.embedding( - input=words[i], - size=[dict_size, 32], - param_attr='emb.w', - is_sparse=False, - ) - embs.append(emb) - - embs = layers.concat(input=embs, axis=1) - wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) - nce_loss = paddle.static.nn.nce( - input=embs, - label=wl, - num_total_classes=dict_size, - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=seed, - param_attr='nce.w', - bias_attr='nce.b', - sample_weight=sample_weights, - ) - feed_dict = dict() - for i in range(window_size): - feed_dict['word_{0}'.format(i)] = inp_word[i] - static_rlt = self.get_static_graph_result( - feed=feed_dict, fetch_list=[nce_loss] - )[0] - - with self.static_graph(): - words = [] - for i in range(window_size): - words.append( - layers.data( - name='word_{0}'.format(i), shape=[None], dtype='int64' - ) - ) - sample_weights = layers.fill_constant( - shape=[5, 1], dtype='float32', value=1 - ) - emb = nn.Embedding( - size=[dict_size, 32], param_attr='emb.w', is_sparse=False - ) - - embs2 = [] - for i in range(window_size): - if i == label_word: - continue - - emb_rlt = emb(words[i]) - embs2.append(emb_rlt) - - embs2 = layers.concat(input=embs2, axis=1) - nce = nn.NCE( - num_total_classes=dict_size, - dim=embs2.shape[1], - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=seed, - param_attr='nce.w', - bias_attr='nce.b', - sample_weight=sample_weights, - ) - - wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) - nce_loss2 = nce(embs2, wl) - feed_dict = dict() - for i in range(len(words)): - feed_dict['word_{0}'.format(i)] = inp_word[i] - - static_rlt2 = self.get_static_graph_result( - feed=feed_dict, fetch_list=[nce_loss2] - )[0] - - with self.dynamic_graph(): - with _test_eager_guard(): - words = [] - for i in range(window_size): - words.append(base.to_variable(inp_word[i])) - sample_weights = layers.fill_constant( - shape=[5, 1], dtype='float32', value=1 - ) - emb = nn.Embedding( - size=[dict_size, 32], - param_attr='eager_emb.w', - is_sparse=False, - ) - - embs3 = [] - for i in range(window_size): - if i == label_word: - continue - - emb_rlt = emb(words[i]) - embs3.append(emb_rlt) - - embs3 = layers.concat( - input=embs3, axis=fluid.dygraph.to_variable(np.array([1])) - ) - nce = nn.NCE( - num_total_classes=dict_size, - dim=embs3.shape[1], - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=seed, - param_attr='eager_nce.w', - bias_attr='eager_nce.b', - sample_weight=sample_weights, - ) - - wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) - dy_eager_rlt = nce(embs3, wl) - dy_eager_rlt_value = dy_eager_rlt.numpy() - - words = [] - for i in range(window_size): - words.append(base.to_variable(inp_word[i])) - sample_weights = layers.fill_constant( - shape=[5, 1], dtype='float32', value=1 - ) - emb = nn.Embedding( - size=[dict_size, 32], param_attr='emb.w', is_sparse=False - ) - - embs3 = [] - for i in range(window_size): - if i == label_word: - continue - - emb_rlt = emb(words[i]) - embs3.append(emb_rlt) - - embs3 = layers.concat( - input=embs3, axis=fluid.dygraph.to_variable(np.array([1])) - ) - nce = nn.NCE( - num_total_classes=dict_size, - dim=embs3.shape[1], - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=seed, - param_attr='nce.w', - bias_attr='nce.b', - sample_weight=sample_weights, - ) - - wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) - dy_rlt = nce(embs3, wl) - dy_rlt_value = dy_rlt.numpy() - - np.testing.assert_allclose(static_rlt2, static_rlt, rtol=1e-05) - np.testing.assert_allclose(dy_rlt_value, static_rlt, rtol=1e-05) - np.testing.assert_allclose(dy_eager_rlt_value, static_rlt, rtol=1e-05) - - with self.dynamic_graph(): - with _test_eager_guard(): - custom_weight = np.random.randn(dict_size, 128).astype( - "float32" - ) - weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) - ) - words = [] - for i in range(window_size): - words.append(base.to_variable(inp_word[i])) - sample_weights = layers.fill_constant( - shape=fluid.dygraph.to_variable(np.array([5, 1])), - dtype='float32', - value=1, - ) - emb = nn.Embedding( - size=[dict_size, 32], - param_attr='eager_emb.w', - is_sparse=False, - ) - - embs3 = [] - for i in range(window_size): - if i == label_word: - continue - - emb_rlt = emb(words[i]) - embs3.append(emb_rlt) - - embs3 = layers.concat(input=embs3, axis=1) - nce1 = nn.NCE( - num_total_classes=dict_size, - dim=embs3.shape[1], - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=seed, - param_attr='eager_nce1.w', - bias_attr='eager_nce1.b', - sample_weight=sample_weights, - ) - - nce2 = nn.NCE( - num_total_classes=dict_size, - dim=embs3.shape[1], - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=seed, - param_attr=weight_attr, - bias_attr='eager_nce2.b', - sample_weight=sample_weights, - ) - - wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) - nce1_loss = nce1(embs3, wl) - nce2_loss = nce2(embs3, wl) - self.assertFalse( - np.array_equal(nce1_loss.numpy(), nce2_loss.numpy()) - ) - nce2.weight.set_value(nce1.weight.numpy()) - nce2.bias.set_value(nce1.bias) - nce1_loss = nce1(embs3, wl) - nce2_loss = nce2(embs3, wl) - np.testing.assert_array_equal( - nce1_loss.numpy(), nce2_loss.numpy() - ) - - nce2.weight = nce1.weight - nce2.bias = nce1.bias - np.testing.assert_array_equal( - nce1.weight.numpy(), nce2.weight.numpy() - ) - np.testing.assert_array_equal( - nce1.bias.numpy(), nce2.bias.numpy() - ) - - custom_weight = np.random.randn(dict_size, 128).astype("float32") - weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) - ) - words = [] - for i in range(window_size): - words.append(base.to_variable(inp_word[i])) - sample_weights = layers.fill_constant( - shape=fluid.dygraph.to_variable(np.array([5, 1])), - dtype='float32', - value=1, - ) - emb = nn.Embedding( - size=[dict_size, 32], param_attr='emb.w', is_sparse=False - ) - - embs3 = [] - for i in range(window_size): - if i == label_word: - continue - - emb_rlt = emb(words[i]) - embs3.append(emb_rlt) - - embs3 = layers.concat(input=embs3, axis=1) - nce1 = nn.NCE( - num_total_classes=dict_size, - dim=embs3.shape[1], - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=seed, - param_attr='nce1.w', - bias_attr='nce1.b', - sample_weight=sample_weights, - ) - - nce2 = nn.NCE( - num_total_classes=dict_size, - dim=embs3.shape[1], - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=seed, - param_attr=weight_attr, - bias_attr='nce2.b', - sample_weight=sample_weights, - ) - - wl = fluid.layers.unsqueeze(words[label_word], axes=[0]) - nce1_loss = nce1(embs3, wl) - nce2_loss = nce2(embs3, wl) - self.assertFalse( - np.array_equal(nce1_loss.numpy(), nce2_loss.numpy()) - ) - nce2.weight.set_value(nce1.weight.numpy()) - nce2.bias.set_value(nce1.bias) - nce1_loss = nce1(embs3, wl) - nce2_loss = nce2(embs3, wl) - np.testing.assert_array_equal(nce1_loss.numpy(), nce2_loss.numpy()) - - nce2.weight = nce1.weight - nce2.bias = nce1.bias - np.testing.assert_array_equal( - nce1.weight.numpy(), nce2.weight.numpy() - ) - np.testing.assert_array_equal(nce1.bias.numpy(), nce2.bias.numpy()) - def test_one_hot(self): with self.dynamic_graph(): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_nce.py b/python/paddle/fluid/tests/unittests/test_nce.py index 359cc50fb5..ee51b0d608 100644 --- a/python/paddle/fluid/tests/unittests/test_nce.py +++ b/python/paddle/fluid/tests/unittests/test_nce.py @@ -330,58 +330,5 @@ class TestNCE_OpError(unittest.TestCase): ) -class TestDygraphNCE_OpError(unittest.TestCase): - def test_NCE_errors(self): - with program_guard(Program(), Program()): - nce = fluid.NCE(20, 5) - input1 = fluid.create_lod_tensor( - np.array([0.0, 3.0, 2.0, 4.0]), [[1, 1, 2]], fluid.CPUPlace() - ) - label1 = fluid.layers.data( - name='label1', shape=[-1, 4], dtype="int64" - ) - # the input(input) of NCE layer must be Variable. - self.assertRaises(TypeError, nce, input1, label1) - - input2 = fluid.layers.data( - name='input2', shape=[-1, 4], dtype="float32" - ) - label2 = fluid.create_lod_tensor( - np.array([0.0, 3.0, 2.0, 4.0]), [[1, 1, 2]], fluid.CPUPlace() - ) - # the input(label) of NCE layer must be Variable. - self.assertRaises(TypeError, nce, input2, label2) - - input3 = fluid.layers.data( - name='input3', shape=[-1, 4], dtype="float16" - ) - label3 = fluid.layers.data( - name='label3', shape=[-1, 1], dtype="int64" - ) - # the data type of input(input) must be float32 or float64. - self.assertRaises(TypeError, nce, input3, label3) - - input4 = fluid.layers.data( - name='input4', shape=[-1, 4], dtype="float32" - ) - label4 = fluid.layers.data( - name='label4', shape=[-1, 1], dtype="int32" - ) - # the data type of input(label) must be int64. - self.assertRaises(TypeError, nce, input4, label4) - - input5 = fluid.layers.data( - name='input5', shape=[-1, 4], dtype="float32" - ) - label5 = fluid.layers.data( - name='label5', shape=[-1, 1], dtype="int64" - ) - sample_weight = fluid.create_lod_tensor( - np.array([0.0, 3.0, 2.0, 4.0]), [[1, 1, 2]], fluid.CPUPlace() - ) - # the sample_weight of nce must be Variable or None. - self.assertRaises(TypeError, nce, input5, label5, sample_weight) - - if __name__ == '__main__': unittest.main() -- GitLab