Unverified commit 3ba1237e authored by wangzhen38, committed by GitHub

[remove fluid] GRUUnit NCE (#48610)

* [remove fluid] GRUUnit NCE

* [remove fluid] GRUUnit NCE

* [remove fluid] GRUUnit NCE

* [remove fluid] GRUUnit NCE
Parent: a0f43889
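Migration note (not part of the diff): the updated test code below replaces fluid.dygraph.GRUUnit with paddle.nn.GRUCell, while the NCE layer is removed without a drop-in dygraph replacement appearing in this diff. Below is a minimal, hypothetical sketch of the GRU migration, assuming Paddle 2.x dygraph mode; the shapes and variable names are illustrative only. GRUCell keeps its own input-to-hidden projection and returns two values instead of GRUUnit's three, so feeding it the old 3 * D pre-projected features (as the tests do) is a behavioral change rather than an exact numerical equivalent.

    # Hypothetical migration sketch, not part of this commit (assumes Paddle 2.x).
    import paddle

    D = 5                                # hidden size
    batch = 9
    x = paddle.rand([batch, 3 * D])      # previously the pre-projected GRUUnit input
    h_prev = paddle.rand([batch, D])     # previous hidden state

    # GRUCell(input_size, hidden_size) owns its own input-to-hidden weights.
    cell = paddle.nn.GRUCell(input_size=3 * D, hidden_size=D)

    # GRUCell returns (output, new_state) instead of GRUUnit's
    # (updated_hidden, reset_hidden_pre, gate), so call sites unpack two values.
    h, new_state = cell(x, h_prev)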
@@ -54,8 +54,6 @@ __all__ = [
     'Linear',
     'BatchNorm',
     'Embedding',
-    'GRUUnit',
-    'NCE',
     'PRelu',
     'BilinearTensorProduct',
     'Conv2DTranspose',
@@ -1363,484 +1361,6 @@ class Embedding(layers.Layer):
        return out
class GRUUnit(layers.Layer):
"""
**GRU unit layer**
It creates a callable object of the GRUUnit class.
If origin_mode is True, then the equation of a gru step is from paper
`Learning Phrase Representations using RNN Encoder-Decoder for Statistical
Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
.. math::
u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
If origin_mode is False, then the equation of a gru step is from paper
`Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
.. math::
u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t)
The inputs of the GRU unit include :math:`z_t` and :math:`h_{t-1}`. In terms
of the equation above, the :math:`z_t` is split into 3 parts -
:math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to
implement a full GRU unit operator for an input, a fully
connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`.
The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
of the GRU cell. Unlike LSTM, GRU has one fewer gate. However, there is
an intermediate candidate hidden output, which is denoted by :math:`m_t`.
This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})`
and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.
Parameters:
size (int): The input dimension value.
param_attr(ParamAttr, optional): The parameter attribute for the learnable
hidden-hidden weight matrix.
**Note**:
1. The shape of the weight matrix is :math:`[D, 3*D]`, where D is the hidden size.
2. All elements in the weight matrix can be divided into two parts. The first
part are weights of the update gate and reset gate with shape :math:`[D, 2*D]`,
and the second part are weights for candidate hidden state with shape :math:`[D, D]`.
If it is set to None or one attribute of ParamAttr, gru_unit will
create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. The default
value is None.
bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias
of GRU. Note that the bias with shape :math:`[1, 3*D]` concatenates
the bias in the update gate, reset gate and candidate calculations.
If it is set to False, no bias will be applied to the update gate,
reset gate and candidate calculations. If it is set to None or one
attribute of ParamAttr, gru_unit will create ParamAttr as
bias_attr. If the Initializer of the bias_attr is not set, the bias
is initialized zero. The default value is None.
activation (str): The activation type for cell (actNode).
The default value is 'tanh'.
gate_activation (str): The activation type for gates (actGate).
The default value is 'sigmoid'.
dtype(str): The dtype of the layers. The data type can be set as
'float32', 'float64'. The default value is 'float32'.
Attribute:
**weight** (Parameter): the learnable weights of this layer.
**bias** (Parameter): the learnable bias of this layer.
Returns:
tuple: The hidden value, reset-hidden value and gate values. The hidden value
is a 2-D tensor with shape :math:`[T, D]` . The reset-hidden value is a
2-D tensor with shape :math:`[T, D]` . The gate value is a 2-D tensor with
shape :math:`[T, 3*D]`.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.dygraph.base as base
import numpy
lod = [[2, 4, 3]]
D = 5
T = sum(lod[0])
input = numpy.random.rand(T, 3 * D).astype('float32')
hidden_input = numpy.random.rand(T, D).astype('float32')
with fluid.dygraph.guard():
gru = fluid.dygraph.GRUUnit(size=D * 3)
dy_ret = gru(
base.to_variable(input), base.to_variable(hidden_input))
"""
def __init__(
self,
size,
param_attr=None,
bias_attr=None,
activation='tanh',
gate_activation='sigmoid',
origin_mode=False,
dtype='float32',
):
super().__init__()
self._bias_attr = bias_attr
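# Map activation names to the integer codes consumed by the gru_unit op's
# 'activation' and 'gate_activation' attributes (used in forward below).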
activation_dict = dict(
identity=0,
sigmoid=1,
tanh=2,
relu=3,
)
self.activation = activation_dict[activation]
self.gate_activation = activation_dict[gate_activation]
self._dtype = dtype
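# `size` is the width of the pre-projected input (3 * D); the hidden width D
# used for the weight and bias shapes below is therefore size // 3.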
size = size // 3
# create weight
self.weight = self.create_parameter(
attr=param_attr, shape=[size, 3 * size], dtype=dtype
)
# create bias
bias_size = [1, 3 * size]
self._bias_size = bias_size
self.bias = self.create_parameter(
attr=bias_attr, shape=bias_size, dtype=dtype, is_bias=True
)
def forward(self, input, hidden):
if _non_static_mode():
gate, reset_hidden_pre, updated_hidden = _legacy_C_ops.gru_unit(
input,
hidden,
self.weight,
self.bias,
'activation',
self.activation,
'gate_activation',
self.gate_activation,
)
return updated_hidden, reset_hidden_pre, gate
check_variable_and_dtype(
input, 'input', ['float32', 'float64'], 'GRUUnit'
)
check_variable_and_dtype(
hidden, 'hidden', ['float32', 'float64'], 'GRUUnit'
)
inputs = {
'Input': [input],
'HiddenPrev': [hidden],
'Weight': [self.weight],
}
if self.bias is not None:
inputs['Bias'] = [self.bias]
gate = self._helper.create_variable_for_type_inference(self._dtype)
reset_hidden_pre = self._helper.create_variable_for_type_inference(
self._dtype
)
updated_hidden = self._helper.create_variable_for_type_inference(
self._dtype
)
self._helper.append_op(
type='gru_unit',
inputs=inputs,
outputs={
'Gate': gate,
'ResetHiddenPrev': reset_hidden_pre,
'Hidden': updated_hidden,
},
attrs={
'activation': self.activation,
'gate_activation': self.gate_activation,
},
)
return updated_hidden, reset_hidden_pre, gate
class NCE(layers.Layer):
"""
This interface is used to construct a callable object of the ``NCE`` class.
For more details, refer to code examples.
It implements the ``NCE`` loss function.
By default this function uses a uniform distribution for sampling, and it
computes and returns the noise-contrastive estimation training loss. See
`Noise-contrastive estimation: A new estimation principle for unnormalized statistical models <http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf>`_ .
Parameters:
num_total_classes (int): Total number of classes in all samples.
dim (int): Dimension of input (possibly embedding dim).
param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
of nce. If it is set to None or one attribute of ParamAttr, nce
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr or bool, optional): The attribute for the bias of nce.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, nce
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
num_neg_samples (int, optional): The number of negative classes. The default value is 10.
sampler (str, optional): The sampler used to sample class from negative classes.
It can be 'uniform', 'log_uniform' or 'custom_dist'.
default: 'uniform'.
custom_dist (float[], optional): A float[] with size=num_total_classes.
It is used when sampler is set to 'custom_dist'.
custom_dist[i] is the probability of i-th class to be sampled.
Default: None.
seed (int, optional): The seed used in sampler. Default: 0.
is_sparse(bool, optional): The flag indicating whether to use sparse update. If is_sparse is True, the weight@GRAD and bias@GRAD will be changed to SelectedRows. Default: False.
dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".
Attribute:
**weight** (Parameter): the learnable weights of this layer.
**bias** (Parameter or None): the learnable bias of this layer.
Returns:
None
Examples:
.. code-block:: python
import numpy as np
import paddle.fluid as fluid
window_size = 5
dict_size = 20
label_word = int(window_size // 2) + 1
inp_word = np.array([[1], [2], [3], [4], [5]]).astype('int64')
nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32')
with fluid.dygraph.guard():
words = []
for i in range(window_size):
words.append(fluid.dygraph.base.to_variable(inp_word[i]))
emb = fluid.Embedding(
size=[dict_size, 32],
param_attr='emb.w',
is_sparse=False)
embs3 = []
for i in range(window_size):
if i == label_word:
continue
emb_rlt = emb(words[i])
embs3.append(emb_rlt)
embs3 = fluid.layers.concat(input=embs3, axis=1)
nce = fluid.NCE(
num_total_classes=dict_size,
dim=embs3.shape[1],
num_neg_samples=2,
sampler="custom_dist",
custom_dist=nid_freq_arr.tolist(),
seed=1,
param_attr='nce.w',
bias_attr='nce.b')
wl = fluid.layers.unsqueeze(words[label_word], axes=[0])
nce_loss3 = nce(embs3, wl)
"""
def __init__(
self,
num_total_classes,
dim,
sample_weight=None,
param_attr=None,
bias_attr=None,
num_neg_samples=None,
sampler="uniform",
custom_dist=None,
seed=0,
is_sparse=False,
dtype='float32',
):
super().__init__()
self._param_attr = param_attr
self._bias_attr = bias_attr
self._num_total_classes = num_total_classes
self._dtype = dtype
self._inputs = dict()
self._inputs['SampleWeight'] = (
sample_weight if sample_weight is not None else []
)
if sampler == "uniform":
sampler = 0
elif sampler == "log_uniform":
sampler = 1
elif sampler == "custom_dist":
assert custom_dist is not None
# assert isinstance(custom_dist, Variable)
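# Build Walker's alias-method tables for O(1) sampling from custom_dist:
# probabilities are scaled by the number of classes, split into "big" (> 1)
# and "little" (< 1) buckets, and each little bucket borrows the remainder
# from a big one. alias_probs_[i] holds the acceptance probability for
# class i and alias_[i] the donor class index (-1 means no alias).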
custom_dist_len = len(custom_dist)
alias_probs_ = [0] * custom_dist_len
alias_ = [0] * custom_dist_len
bigs = []
littles = []
for i in range(custom_dist_len):
normal_prob = custom_dist[i] * custom_dist_len
if normal_prob - 1.0 > 0:
bigs.append((i, normal_prob))
elif 1.0 - normal_prob > 0:
littles.append((i, normal_prob))
else:
alias_probs_[i] = normal_prob
alias_[i] = -1
while len(bigs) and len(littles):
big = bigs.pop(0)
little = littles.pop(0)
big_idx = big[0]
big_prob = big[1]
alias_probs_[little[0]] = little[1]
alias_[little[0]] = big_idx
big_left = big[1] + little[1] - 1
if big_left - 1.0 > 0:
bigs.append((big_idx, big_left))
elif 1.0 - big_left > 0:
littles.append((big_idx, big_left))
else:
alias_probs_[big_idx] = big_left
alias_[big_idx] = -1
if len(bigs):
big = bigs.pop(0)
alias_probs_[big[0]] = 1.0
alias_[big[0]] = -1
if len(littles):
little = littles.pop(0)
alias_probs_[little[0]] = 1.0
alias_[little[0]] = -1
def _init_by_numpy_array(numpy_array):
ret = self.create_parameter(
attr=ParamAttr(),
shape=numpy_array.shape,
dtype=numpy_array.dtype,
default_initializer=NumpyArrayInitializer(numpy_array),
)
ret.stop_gradient = True
return ret
self._inputs['CustomDistProbs'] = _init_by_numpy_array(
np.array(custom_dist).astype('float32')
)
self._inputs['CustomDistAlias'] = _init_by_numpy_array(
np.array(alias_).astype('int32')
)
self._inputs['CustomDistAliasProbs'] = _init_by_numpy_array(
np.array(alias_probs_).astype('float32')
)
sampler = 2
else:
raise Exception("Unsupported sampler type.")
if num_neg_samples is None:
num_neg_samples = 10
else:
num_neg_samples = int(num_neg_samples)
self._num_neg_samples = num_neg_samples
remote_prefetch = is_sparse
print(
"In sparse mode, prefetch may slow training down if the model has only small parameters"
)
self._attrs = {
'num_total_classes': int(num_total_classes),
'num_neg_samples': num_neg_samples,
'seed': seed,
'sampler': sampler,
'is_sparse': is_sparse,
'remote_prefetch': remote_prefetch,
}
self.weight = self.create_parameter(
attr=self._param_attr,
shape=[self._num_total_classes, dim],
is_bias=False,
dtype=self._dtype,
)
if self._bias_attr:
self.bias = self.create_parameter(
attr=self._bias_attr,
shape=[self._num_total_classes, 1],
is_bias=True,
dtype=self._dtype,
)
self._inputs['Bias'] = self.bias
self._inputs['Weight'] = self.weight
def forward(self, input, label, sample_weight=None):
if _non_static_mode():
attrs = (
'num_total_classes',
self._attrs['num_total_classes'],
'num_neg_samples',
self._attrs['num_neg_samples'],
'seed',
self._attrs['seed'],
'sampler',
self._attrs['sampler'],
'is_sparse',
self._attrs['is_sparse'],
'remote_prefetch',
self._attrs['remote_prefetch'],
)
cost, _, _ = _legacy_C_ops.nce(
input,
label,
self.weight,
self.bias,
self._inputs['SampleWeight'],
self._inputs['CustomDistProbs'],
self._inputs['CustomDistAlias'],
self._inputs['CustomDistAliasProbs'],
*attrs
)
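# Normalize the cost by the number of sampled classes per example:
# one positive plus num_neg_samples negatives.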
return cost / (self._num_neg_samples + 1)
check_variable_and_dtype(input, "input", ['float32', 'float64'], "NCE")
check_variable_and_dtype(label, "label", ['int64'], "NCE")
check_type(
sample_weight, 'sample_weight', (Variable, type(None)), 'NCE'
)
assert isinstance(input, Variable)
assert isinstance(label, Variable)
self._inputs['Input'] = input
self._inputs['Label'] = label
self._inputs['SampleWeight'] = (
sample_weight if sample_weight is not None else []
)
cost = self._helper.create_variable_for_type_inference(
dtype=input.dtype
)
sample_logits = self._helper.create_variable_for_type_inference(
dtype=input.dtype
)
sample_labels = self._helper.create_variable_for_type_inference(
dtype=label.dtype
)
self._helper.append_op(
type='nce',
inputs=self._inputs,
outputs={
'Cost': cost,
'SampleLogits': sample_logits,
'SampleLabels': sample_labels,
},
attrs=self._attrs,
)
return cost / (self._num_neg_samples + 1)
class PRelu(layers.Layer):
    r"""
    This interface is used to construct a callable object of the ``PRelu`` class.
...
@@ -25,7 +25,7 @@ os.environ["CUDA_VISIBLE_DEVICES"] = "2"
 import paddle
 import paddle.fluid as fluid
 from paddle import _legacy_C_ops
-from paddle.fluid.dygraph import Embedding, GRUUnit, to_variable
+from paddle.fluid.dygraph import Embedding, to_variable
 from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 from paddle.fluid.framework import _non_static_mode
 from paddle.jit import ProgramTranslator
@@ -57,13 +57,9 @@ class DynamicGRU(fluid.dygraph.Layer):
     ):
         super().__init__()
-        self.gru_unit = GRUUnit(
-            size * 3,
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            activation=candidate_activation,
-            gate_activation=gate_activation,
-            origin_mode=origin_mode,
-        )
+        self.gru_unit = paddle.nn.GRUCell(
+            size * 3,
+            size,
+        )
         self.size = size
...
@@ -23,22 +23,6 @@ from paddle.fluid.framework import Program, program_guard
from paddle.fluid.layers import gru_unit
class TestGRUUnitAPIError(unittest.TestCase):
def test_errors(self):
with fluid.program_guard(fluid.Program(), fluid.Program()):
D = 5
layer = fluid.dygraph.nn.GRUUnit(size=D * 3)
# the input must be Variable.
x0 = fluid.create_lod_tensor(
np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()
)
self.assertRaises(TypeError, layer, x0)
# the input dtype must be float32 or float64
x = fluid.data(name='x', shape=[-1, D * 3], dtype='float16')
hidden = fluid.data(name='hidden', shape=[-1, D], dtype='float32')
self.assertRaises(TypeError, layer, x, hidden)
class GRUActivationType(OpTest):
    identity = 0
    sigmoid = 1
...
@@ -21,7 +21,7 @@ import numpy as np
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.framework as framework
-from paddle.fluid.dygraph.nn import NCE, BatchNorm, Embedding, GroupNorm, PRelu
+from paddle.fluid.dygraph.nn import BatchNorm, Embedding, GroupNorm, PRelu
 from paddle.nn import Linear
@@ -212,9 +212,6 @@ class TestDygraphLoadStatic(unittest.TestCase):
         self.layer_norm_1 = paddle.nn.LayerNorm([10])
         self.layer_norm_2 = paddle.nn.LayerNorm(10)
-        self.nce1 = NCE(10000, 100)
-        self.nce2 = NCE(10000, 100)
         self.prelu1 = PRelu("channel", channel=5)
         self.prelu2 = PRelu("channel", channel=5)
...
@@ -21,7 +21,7 @@ import paddle
 import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid.dygraph.base import to_variable
-from paddle.fluid.dygraph.nn import BatchNorm, Embedding, GRUUnit
+from paddle.fluid.dygraph.nn import BatchNorm, Embedding
 from paddle.fluid.framework import _test_eager_guard
 from paddle.nn import Linear
@@ -168,13 +168,9 @@ class DynamicGRU(fluid.dygraph.Layer):
     ):
         super().__init__()
-        self.gru_unit = GRUUnit(
-            size * 3,
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            activation=candidate_activation,
-            gate_activation=gate_activation,
-            origin_mode=origin_mode,
-        )
+        self.gru_unit = paddle.nn.GRUCell(
+            size * 3,
+            size,
+        )
         self.h_0 = h_0
@@ -189,7 +185,7 @@ class DynamicGRU(fluid.dygraph.Layer):
             i = inputs.shape[1] - 1 - i
         input_ = paddle.slice(inputs, axes=[1], starts=[i], ends=[i + 1])
         input_ = paddle.reshape(input_, [-1, input_.shape[2]])
-        hidden, reset, gate = self.gru_unit(input_, hidden)
+        hidden, reset = self.gru_unit(input_, hidden)
         hidden_ = paddle.reshape(hidden, [-1, 1, hidden.shape[1]])
         if self.is_reverse:
             res = [hidden_] + res
@@ -330,9 +326,7 @@ class GRUDecoderWithAttention(fluid.dygraph.Layer):
         self.fc_2_layer = Linear(
             decoder_size, decoder_size * 3, bias_attr=False
         )
-        self.gru_unit = GRUUnit(
-            size=decoder_size * 3, param_attr=None, bias_attr=None
-        )
+        self.gru_unit = paddle.nn.GRUCell(decoder_size * 3, decoder_size)
         self.out_layer = Linear(decoder_size, num_classes + 2, bias_attr=None)
         self.decoder_size = decoder_size
@@ -357,7 +351,7 @@ class GRUDecoderWithAttention(fluid.dygraph.Layer):
         fc_2 = self.fc_2_layer(current_word)
         decoder_inputs = paddle.add(x=fc_1, y=fc_2)
-        h, _, _ = self.gru_unit(decoder_inputs, hidden_mem)
+        h, _ = self.gru_unit(decoder_inputs, hidden_mem)
         hidden_mem = h
         out = self.out_layer(h)
         out = paddle.nn.functional.softmax(out)
...
@@ -316,147 +316,6 @@ class TestLayer(LayerTest):
        np.testing.assert_allclose(static_ret, dy_ret_value, rtol=1e-05)
        np.testing.assert_allclose(static_ret, dy_eager_ret_value, rtol=1e-05)
def test_gru_unit(self):
lod = [[2, 4, 3]]
D = 5
T = sum(lod[0])
N = len(lod[0])
input = np.random.rand(T, 3 * D).astype('float32')
hidden_input = np.random.rand(T, D).astype('float32')
with self.static_graph():
x = layers.data(name='x', shape=[-1, D * 3], dtype='float32')
hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32')
updated_hidden, reset_hidden_pre, gate = layers.gru_unit(
input=x, hidden=hidden, size=D * 3
)
static_ret = self.get_static_graph_result(
feed={'x': input, 'hidden': hidden_input},
fetch_list=[updated_hidden, reset_hidden_pre, gate],
)
with self.static_graph():
x = layers.data(name='x', shape=[-1, D * 3], dtype='float32')
hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32')
updated_hidden, reset_hidden_pre, gate = layers.gru_unit(
input=x, hidden=hidden, size=D * 3
)
gru = nn.GRUUnit(size=D * 3)
updated_hidden, reset_hidden_pre, gate = gru(x, hidden)
static_ret2 = self.get_static_graph_result(
feed={'x': input, 'hidden': hidden_input},
fetch_list=[updated_hidden, reset_hidden_pre, gate],
)
with self.dynamic_graph():
with _test_eager_guard():
gru = nn.GRUUnit(size=D * 3)
dy_eager_ret = gru(
base.to_variable(input), base.to_variable(hidden_input)
)
dy_eager_ret_value = []
for i in range(len(static_ret)):
dy_eager_ret_value.append(dy_eager_ret[i].numpy())
gru = nn.GRUUnit(size=D * 3)
dy_ret = gru(
base.to_variable(input), base.to_variable(hidden_input)
)
dy_ret_value = []
for i in range(len(static_ret)):
dy_ret_value.append(dy_ret[i].numpy())
for i in range(len(static_ret)):
np.testing.assert_allclose(
static_ret[i], static_ret2[i], rtol=1e-05
)
np.testing.assert_allclose(
static_ret[i], dy_ret_value[i], rtol=1e-05
)
np.testing.assert_allclose(
static_ret[i], dy_eager_ret_value[i], rtol=1e-05
)
with self.dynamic_graph():
with _test_eager_guard():
custom_weight = np.random.randn(D, D * 3).astype("float32")
weight_attr = fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(
custom_weight
)
)
gru1 = nn.GRUUnit(size=D * 3)
gru2 = nn.GRUUnit(size=D * 3, param_attr=weight_attr)
dy_ret1 = gru1(
base.to_variable(input), base.to_variable(hidden_input)
)
dy_ret2 = gru2(
base.to_variable(input), base.to_variable(hidden_input)
)
self.assertFalse(
np.array_equal(gru1.weight.numpy(), gru2.weight.numpy())
)
for o1, o2 in zip(dy_ret1, dy_ret2):
self.assertFalse(np.array_equal(o1.numpy(), o2.numpy()))
gru2.weight.set_value(gru1.weight.numpy())
gru2.bias.set_value(gru1.bias)
dy_ret1 = gru1(
base.to_variable(input), base.to_variable(hidden_input)
)
dy_ret2 = gru2(
base.to_variable(input), base.to_variable(hidden_input)
)
for o1, o2 in zip(dy_ret1, dy_ret2):
np.testing.assert_array_equal(o1.numpy(), o2.numpy())
gru2.weight = gru1.weight
gru2.bias = gru1.bias
np.testing.assert_array_equal(
gru1.weight.numpy(), gru2.weight.numpy()
)
np.testing.assert_array_equal(
gru1.bias.numpy(), gru2.bias.numpy()
)
custom_weight = np.random.randn(D, D * 3).astype("float32")
weight_attr = fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(
custom_weight
)
)
gru1 = nn.GRUUnit(size=D * 3)
gru2 = nn.GRUUnit(size=D * 3, param_attr=weight_attr)
dy_ret1 = gru1(
base.to_variable(input), base.to_variable(hidden_input)
)
dy_ret2 = gru2(
base.to_variable(input), base.to_variable(hidden_input)
)
self.assertFalse(
np.array_equal(gru1.weight.numpy(), gru2.weight.numpy())
)
for o1, o2 in zip(dy_ret1, dy_ret2):
self.assertFalse(np.array_equal(o1.numpy(), o2.numpy()))
gru2.weight.set_value(gru1.weight.numpy())
gru2.bias.set_value(gru1.bias)
dy_ret1 = gru1(
base.to_variable(input), base.to_variable(hidden_input)
)
dy_ret2 = gru2(
base.to_variable(input), base.to_variable(hidden_input)
)
for o1, o2 in zip(dy_ret1, dy_ret2):
np.testing.assert_array_equal(o1.numpy(), o2.numpy())
gru2.weight = gru1.weight
gru2.bias = gru1.bias
np.testing.assert_array_equal(
gru1.weight.numpy(), gru2.weight.numpy()
)
np.testing.assert_array_equal(gru1.bias.numpy(), gru2.bias.numpy())
    def test_elementwise_math(self):
        n = np.ones([3, 3], dtype='float32')
        n2 = np.ones([3, 3], dtype='float32') * 1.1
@@ -1139,336 +998,6 @@ class TestLayer(LayerTest):
            emb1.weight.numpy(), emb2.weight.numpy()
        )
def test_nce(self):
window_size = 5
dict_size = 20
label_word = int(window_size // 2) + 1
inp_word = np.array([[1], [2], [3], [4], [5]]).astype('int64')
nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32')
seed = 1
with self.static_graph():
words = []
for i in range(window_size):
words.append(
layers.data(
name='word_{0}'.format(i), shape=[None], dtype='int64'
)
)
sample_weights = layers.fill_constant(
shape=[5, 1], dtype='float32', value=1
)
embs = []
for i in range(window_size):
if i == label_word:
continue
emb = fluid.embedding(
input=words[i],
size=[dict_size, 32],
param_attr='emb.w',
is_sparse=False,
)
embs.append(emb)
embs = layers.concat(input=embs, axis=1)
wl = fluid.layers.unsqueeze(words[label_word], axes=[0])
nce_loss = paddle.static.nn.nce(
input=embs,
label=wl,
num_total_classes=dict_size,
num_neg_samples=2,
sampler="custom_dist",
custom_dist=nid_freq_arr.tolist(),
seed=seed,
param_attr='nce.w',
bias_attr='nce.b',
sample_weight=sample_weights,
)
feed_dict = dict()
for i in range(window_size):
feed_dict['word_{0}'.format(i)] = inp_word[i]
static_rlt = self.get_static_graph_result(
feed=feed_dict, fetch_list=[nce_loss]
)[0]
with self.static_graph():
words = []
for i in range(window_size):
words.append(
layers.data(
name='word_{0}'.format(i), shape=[None], dtype='int64'
)
)
sample_weights = layers.fill_constant(
shape=[5, 1], dtype='float32', value=1
)
emb = nn.Embedding(
size=[dict_size, 32], param_attr='emb.w', is_sparse=False
)
embs2 = []
for i in range(window_size):
if i == label_word:
continue
emb_rlt = emb(words[i])
embs2.append(emb_rlt)
embs2 = layers.concat(input=embs2, axis=1)
nce = nn.NCE(
num_total_classes=dict_size,
dim=embs2.shape[1],
num_neg_samples=2,
sampler="custom_dist",
custom_dist=nid_freq_arr.tolist(),
seed=seed,
param_attr='nce.w',
bias_attr='nce.b',
sample_weight=sample_weights,
)
wl = fluid.layers.unsqueeze(words[label_word], axes=[0])
nce_loss2 = nce(embs2, wl)
feed_dict = dict()
for i in range(len(words)):
feed_dict['word_{0}'.format(i)] = inp_word[i]
static_rlt2 = self.get_static_graph_result(
feed=feed_dict, fetch_list=[nce_loss2]
)[0]
with self.dynamic_graph():
with _test_eager_guard():
words = []
for i in range(window_size):
words.append(base.to_variable(inp_word[i]))
sample_weights = layers.fill_constant(
shape=[5, 1], dtype='float32', value=1
)
emb = nn.Embedding(
size=[dict_size, 32],
param_attr='eager_emb.w',
is_sparse=False,
)
embs3 = []
for i in range(window_size):
if i == label_word:
continue
emb_rlt = emb(words[i])
embs3.append(emb_rlt)
embs3 = layers.concat(
input=embs3, axis=fluid.dygraph.to_variable(np.array([1]))
)
nce = nn.NCE(
num_total_classes=dict_size,
dim=embs3.shape[1],
num_neg_samples=2,
sampler="custom_dist",
custom_dist=nid_freq_arr.tolist(),
seed=seed,
param_attr='eager_nce.w',
bias_attr='eager_nce.b',
sample_weight=sample_weights,
)
wl = fluid.layers.unsqueeze(words[label_word], axes=[0])
dy_eager_rlt = nce(embs3, wl)
dy_eager_rlt_value = dy_eager_rlt.numpy()
words = []
for i in range(window_size):
words.append(base.to_variable(inp_word[i]))
sample_weights = layers.fill_constant(
shape=[5, 1], dtype='float32', value=1
)
emb = nn.Embedding(
size=[dict_size, 32], param_attr='emb.w', is_sparse=False
)
embs3 = []
for i in range(window_size):
if i == label_word:
continue
emb_rlt = emb(words[i])
embs3.append(emb_rlt)
embs3 = layers.concat(
input=embs3, axis=fluid.dygraph.to_variable(np.array([1]))
)
nce = nn.NCE(
num_total_classes=dict_size,
dim=embs3.shape[1],
num_neg_samples=2,
sampler="custom_dist",
custom_dist=nid_freq_arr.tolist(),
seed=seed,
param_attr='nce.w',
bias_attr='nce.b',
sample_weight=sample_weights,
)
wl = fluid.layers.unsqueeze(words[label_word], axes=[0])
dy_rlt = nce(embs3, wl)
dy_rlt_value = dy_rlt.numpy()
np.testing.assert_allclose(static_rlt2, static_rlt, rtol=1e-05)
np.testing.assert_allclose(dy_rlt_value, static_rlt, rtol=1e-05)
np.testing.assert_allclose(dy_eager_rlt_value, static_rlt, rtol=1e-05)
with self.dynamic_graph():
with _test_eager_guard():
custom_weight = np.random.randn(dict_size, 128).astype(
"float32"
)
weight_attr = fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(
custom_weight
)
)
words = []
for i in range(window_size):
words.append(base.to_variable(inp_word[i]))
sample_weights = layers.fill_constant(
shape=fluid.dygraph.to_variable(np.array([5, 1])),
dtype='float32',
value=1,
)
emb = nn.Embedding(
size=[dict_size, 32],
param_attr='eager_emb.w',
is_sparse=False,
)
embs3 = []
for i in range(window_size):
if i == label_word:
continue
emb_rlt = emb(words[i])
embs3.append(emb_rlt)
embs3 = layers.concat(input=embs3, axis=1)
nce1 = nn.NCE(
num_total_classes=dict_size,
dim=embs3.shape[1],
num_neg_samples=2,
sampler="custom_dist",
custom_dist=nid_freq_arr.tolist(),
seed=seed,
param_attr='eager_nce1.w',
bias_attr='eager_nce1.b',
sample_weight=sample_weights,
)
nce2 = nn.NCE(
num_total_classes=dict_size,
dim=embs3.shape[1],
num_neg_samples=2,
sampler="custom_dist",
custom_dist=nid_freq_arr.tolist(),
seed=seed,
param_attr=weight_attr,
bias_attr='eager_nce2.b',
sample_weight=sample_weights,
)
wl = fluid.layers.unsqueeze(words[label_word], axes=[0])
nce1_loss = nce1(embs3, wl)
nce2_loss = nce2(embs3, wl)
self.assertFalse(
np.array_equal(nce1_loss.numpy(), nce2_loss.numpy())
)
nce2.weight.set_value(nce1.weight.numpy())
nce2.bias.set_value(nce1.bias)
nce1_loss = nce1(embs3, wl)
nce2_loss = nce2(embs3, wl)
np.testing.assert_array_equal(
nce1_loss.numpy(), nce2_loss.numpy()
)
nce2.weight = nce1.weight
nce2.bias = nce1.bias
np.testing.assert_array_equal(
nce1.weight.numpy(), nce2.weight.numpy()
)
np.testing.assert_array_equal(
nce1.bias.numpy(), nce2.bias.numpy()
)
custom_weight = np.random.randn(dict_size, 128).astype("float32")
weight_attr = fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(
custom_weight
)
)
words = []
for i in range(window_size):
words.append(base.to_variable(inp_word[i]))
sample_weights = layers.fill_constant(
shape=fluid.dygraph.to_variable(np.array([5, 1])),
dtype='float32',
value=1,
)
emb = nn.Embedding(
size=[dict_size, 32], param_attr='emb.w', is_sparse=False
)
embs3 = []
for i in range(window_size):
if i == label_word:
continue
emb_rlt = emb(words[i])
embs3.append(emb_rlt)
embs3 = layers.concat(input=embs3, axis=1)
nce1 = nn.NCE(
num_total_classes=dict_size,
dim=embs3.shape[1],
num_neg_samples=2,
sampler="custom_dist",
custom_dist=nid_freq_arr.tolist(),
seed=seed,
param_attr='nce1.w',
bias_attr='nce1.b',
sample_weight=sample_weights,
)
nce2 = nn.NCE(
num_total_classes=dict_size,
dim=embs3.shape[1],
num_neg_samples=2,
sampler="custom_dist",
custom_dist=nid_freq_arr.tolist(),
seed=seed,
param_attr=weight_attr,
bias_attr='nce2.b',
sample_weight=sample_weights,
)
wl = fluid.layers.unsqueeze(words[label_word], axes=[0])
nce1_loss = nce1(embs3, wl)
nce2_loss = nce2(embs3, wl)
self.assertFalse(
np.array_equal(nce1_loss.numpy(), nce2_loss.numpy())
)
nce2.weight.set_value(nce1.weight.numpy())
nce2.bias.set_value(nce1.bias)
nce1_loss = nce1(embs3, wl)
nce2_loss = nce2(embs3, wl)
np.testing.assert_array_equal(nce1_loss.numpy(), nce2_loss.numpy())
nce2.weight = nce1.weight
nce2.bias = nce1.bias
np.testing.assert_array_equal(
nce1.weight.numpy(), nce2.weight.numpy()
)
np.testing.assert_array_equal(nce1.bias.numpy(), nce2.bias.numpy())
    def test_one_hot(self):
        with self.dynamic_graph():
            with _test_eager_guard():
...
@@ -330,58 +330,5 @@ class TestNCE_OpError(unittest.TestCase):
        )
class TestDygraphNCE_OpError(unittest.TestCase):
def test_NCE_errors(self):
with program_guard(Program(), Program()):
nce = fluid.NCE(20, 5)
input1 = fluid.create_lod_tensor(
np.array([0.0, 3.0, 2.0, 4.0]), [[1, 1, 2]], fluid.CPUPlace()
)
label1 = fluid.layers.data(
name='label1', shape=[-1, 4], dtype="int64"
)
# the input(input) of NCE layer must be Variable.
self.assertRaises(TypeError, nce, input1, label1)
input2 = fluid.layers.data(
name='input2', shape=[-1, 4], dtype="float32"
)
label2 = fluid.create_lod_tensor(
np.array([0.0, 3.0, 2.0, 4.0]), [[1, 1, 2]], fluid.CPUPlace()
)
# the input(label) of NCE layer must be Variable.
self.assertRaises(TypeError, nce, input2, label2)
input3 = fluid.layers.data(
name='input3', shape=[-1, 4], dtype="float16"
)
label3 = fluid.layers.data(
name='label3', shape=[-1, 1], dtype="int64"
)
# the data type of input(input) must be float32 or float64.
self.assertRaises(TypeError, nce, input3, label3)
input4 = fluid.layers.data(
name='input4', shape=[-1, 4], dtype="float32"
)
label4 = fluid.layers.data(
name='label4', shape=[-1, 1], dtype="int32"
)
# the data type of input(label) must be int64.
self.assertRaises(TypeError, nce, input4, label4)
input5 = fluid.layers.data(
name='input5', shape=[-1, 4], dtype="float32"
)
label5 = fluid.layers.data(
name='label5', shape=[-1, 1], dtype="int64"
)
sample_weight = fluid.create_lod_tensor(
np.array([0.0, 3.0, 2.0, 4.0]), [[1, 1, 2]], fluid.CPUPlace()
)
# the sample_weight of nce must be Variable or None.
self.assertRaises(TypeError, nce, input5, label5, sample_weight)
if __name__ == '__main__':
    unittest.main()