提交 56e2729c 编写于 作者: G guosheng

Remove hapi.text apis' reuse parameter args for coverage.

test=develop
上级 6e962618
...@@ -16,8 +16,9 @@ from paddle.fluid.dygraph.nn import Linear, Embedding ...@@ -16,8 +16,9 @@ from paddle.fluid.dygraph.nn import Linear, Embedding
from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.base import to_variable
import numpy as np import numpy as np
from hapi.model import Model from hapi.model import Model
from hapi.text.text import GRUEncoder as BiGRUEncoder from hapi.text.text import _GRUEncoder as GRUEncoder
from hapi.text.test import BOWEncoder, CNNEncoder, GRUEncoder from hapi.text.text import _GRUEncoder as BiGRUEncoder
from hapi.text.test import BOWEncoder, CNNEncoder
class CNN(Model): class CNN(Model):
......
...@@ -28,47 +28,6 @@ from hapi.model import Model, Input, set_device ...@@ -28,47 +28,6 @@ from hapi.model import Model, Input, set_device
from hapi.text.text import * from hapi.text.text import *
def sigmoid(x):
    """Return the element-wise logistic sigmoid ``1 / (1 + exp(-x))``.

    The value is evaluated as ``exp(-log(1 + exp(-x)))`` via
    ``np.logaddexp`` so that large-magnitude negative inputs do not
    overflow inside ``np.exp`` (the naive form does).

    Args:
        x: A scalar or numpy array.

    Returns:
        Sigmoid of ``x``, with the same shape as ``x``.
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)), computed without overflow.
    return np.exp(-np.logaddexp(0., -x))
def tanh(x):
    """Return the element-wise hyperbolic tangent of ``x``.

    Uses ``np.tanh`` directly instead of the identity
    ``2 * sigmoid(2 * x) - 1``: the identity cancels to exactly 0 for very
    small ``x`` and overflows in ``exp`` for large-magnitude negative ``x``.

    Args:
        x: A scalar or numpy array.

    Returns:
        ``tanh(x)``, with the same shape as ``x``.
    """
    return np.tanh(x)
def lstm_step(step_in, pre_hidden, pre_cell, gate_w, gate_b, forget_bias=1.0):
    """NumPy reference for a single LSTM step.

    The input and previous hidden state are concatenated along axis 1,
    projected by ``gate_w``, shifted by ``gate_b`` and split along axis 1
    into four equal parts in the order (input gate, candidate, forget gate,
    output gate).

    Args:
        step_in: Input of the current step (assumed 2-D, batch first --
            TODO confirm against callers).
        pre_hidden: Hidden state of the previous step.
        pre_cell: Cell state of the previous step.
        gate_w: Weight applied to ``[step_in, pre_hidden]``.
        gate_b: Bias added to the projected gates.
        forget_bias: Constant added to the forget gate before the sigmoid.

    Returns:
        A tuple ``(new_hidden, new_cell)``.
    """
    stacked = np.concatenate([step_in, pre_hidden], axis=1)
    gates = np.matmul(stacked, gate_w)
    gates += gate_b

    in_gate, cand, forget_gate, out_gate = np.split(gates, 4, axis=1)

    kept = pre_cell * sigmoid(forget_gate + forget_bias)
    written = sigmoid(in_gate) * tanh(cand)
    new_cell = kept + written
    new_hidden = tanh(new_cell) * sigmoid(out_gate)
    return new_hidden, new_cell
def gru_step(step_in, pre_hidden, gate_w, gate_b, candidate_w, candidate_b):
    """NumPy reference for a single GRU step.

    The reset and update gates come from one projection of
    ``[step_in, pre_hidden]`` that is passed through the sigmoid and split in
    two along axis 1 (reset first, update second). The candidate is computed
    from ``[step_in, reset * pre_hidden]``.

    Args:
        step_in: Input of the current step (assumed 2-D, batch first --
            TODO confirm against callers).
        pre_hidden: Hidden state of the previous step.
        gate_w: Weight producing the reset/update gates.
        gate_b: Bias added to the gate projection.
        candidate_w: Weight producing the candidate state.
        candidate_b: Bias added to the candidate projection.

    Returns:
        The new hidden state ``u * pre_hidden + (1 - u) * c``.
    """
    gates = np.matmul(np.concatenate([step_in, pre_hidden], axis=1), gate_w)
    gates += gate_b
    reset, update = np.split(sigmoid(gates), 2, axis=1)

    gated_hidden = reset * pre_hidden
    candidate_in = np.concatenate([step_in, gated_hidden], axis=1)
    candidate = np.matmul(candidate_in, candidate_w)
    candidate += candidate_b
    proposal = tanh(candidate)

    return update * pre_hidden + (1 - update) * proposal
class ModuleApiTest(unittest.TestCase): class ModuleApiTest(unittest.TestCase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
......
...@@ -37,9 +37,6 @@ from hapi.text.text import TransformerDecoder as TransformerDecoder ...@@ -37,9 +37,6 @@ from hapi.text.text import TransformerDecoder as TransformerDecoder
from hapi.text.text import TransformerCell as TransformerCell from hapi.text.text import TransformerCell as TransformerCell
from hapi.text.text import TransformerBeamSearchDecoder as TransformerBeamSearchDecoder from hapi.text.text import TransformerBeamSearchDecoder as TransformerBeamSearchDecoder
from hapi.text.text import GRUCell as GRUCell
from hapi.text.text import GRUEncoderCell as GRUEncoderCell
from hapi.text.text import BiGRU as BiGRU
from hapi.text.text import LinearChainCRF as LinearChainCRF from hapi.text.text import LinearChainCRF as LinearChainCRF
from hapi.text.text import CRFDecoding as CRFDecoding from hapi.text.text import CRFDecoding as CRFDecoding
from hapi.text.text import SequenceTagging as SequenceTagging from hapi.text.text import SequenceTagging as SequenceTagging
...@@ -16,33 +16,22 @@ from __future__ import absolute_import ...@@ -16,33 +16,22 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import os import copy
import collections
import six import six
import sys import sys
if six.PY2: from functools import partial, reduce
reload(sys)
sys.setdefaultencoding('utf8')
import ast
import time
import argparse as argparse
import numpy as np import numpy as np
import multiprocessing
import collections
import copy
from functools import partial, reduce
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.layers.utils as utils import paddle.fluid.layers.utils as utils
from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
from paddle.fluid.dygraph import Embedding, Linear, LayerNorm, GRUUnit, Conv2D, Pool2D
from paddle.fluid.data_feeder import convert_dtype
from paddle.fluid import layers from paddle.fluid import layers
from paddle.fluid.dygraph import Layer
from paddle.fluid.layers import BeamSearchDecoder from paddle.fluid.layers import BeamSearchDecoder
from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
from paddle.fluid.dygraph import Layer, Embedding, Linear, LayerNorm, GRUUnit, Conv2D, Pool2D
from paddle.fluid.data_feeder import convert_dtype
__all__ = [ __all__ = [
'RNNCell', 'RNNCell',
...@@ -72,7 +61,6 @@ __all__ = [ ...@@ -72,7 +61,6 @@ __all__ = [
'LinearChainCRF', 'LinearChainCRF',
'CRFDecoding', 'CRFDecoding',
'SequenceTagging', 'SequenceTagging',
'GRUEncoder',
] ]
...@@ -234,25 +222,6 @@ class BasicLSTMCell(RNNCell): ...@@ -234,25 +222,6 @@ class BasicLSTMCell(RNNCell):
forget_bias(float, optional): forget bias used when computing forget gate. forget_bias(float, optional): forget bias used when computing forget gate.
Default 1.0 Default 1.0
dtype(string, optional): The data type used in this cell. Default float32. dtype(string, optional): The data type used in this cell. Default float32.
forget_gate_weights (dict, optional): A dict includes `w`, `h` and `b`
as keys, and the corresponding values should be instances of Parameter
which represent :math:`W_{x_{f}}, W_{h_{f}}, b_{f}` and have shape
[input_size, hidden_size], [hidden_size, hidden_size], [hidden_size]
separately. It is used for reusing and sharing weights when provided,
otherwise create these parameters. Note that parameters from input
gate, forget gate and cell would be concatenated in implementation.
input_gate_weights (dict, optional): A dict includes `w`, `h` and `b` as keys,
and the corresponding values should be instances of Parameter which
represent :math:`W_{x_{i}}, W_{h_{i}}, b_{i}` separately. It has the
same usage as :attr:`forget_gate_weights`.
output_gate_weights (dict, optional): A dict includes `w`, `h` and `b` as keys,
and the corresponding values should be instances of Parameter which
represent :math:`W_{x_{o}}, W_{h_{o}}, b_{o}` separately. It has the
same usage as :attr:`forget_gate_weights`.
cell_weights (dict, optional): A dict includes `w`, `h` and `b` as keys,
and the corresponding values should be instances of Parameter which
represent :math:`W_{x_{c}}, W_{h_{c}}, b_{c}` separately. It has the
same usage as :attr:`forget_gate_weights`.
""" """
def __init__(self, def __init__(self,
...@@ -263,19 +232,7 @@ class BasicLSTMCell(RNNCell): ...@@ -263,19 +232,7 @@ class BasicLSTMCell(RNNCell):
gate_activation=None, gate_activation=None,
activation=None, activation=None,
forget_bias=1.0, forget_bias=1.0,
dtype='float32', dtype='float32'):
forget_gate_weights={"w": None,
"h": None,
"b": None},
input_gate_weights={"w": None,
"h": None,
"b": None},
output_gate_weights={"w": None,
"h": None,
"b": None},
cell_weights={"w": None,
"h": None,
"b": None}):
super(BasicLSTMCell, self).__init__() super(BasicLSTMCell, self).__init__()
self._hidden_size = hidden_size self._hidden_size = hidden_size
...@@ -290,20 +247,6 @@ class BasicLSTMCell(RNNCell): ...@@ -290,20 +247,6 @@ class BasicLSTMCell(RNNCell):
self._dtype = dtype self._dtype = dtype
self._input_size = input_size self._input_size = input_size
self.use_customized_weight = False
for _weights in [
forget_gate_weights, input_gate_weights, output_gate_weights,
cell_weights
]:
for _key in _weights:
if _weights[_key] is not None:
self.use_customized_weight = True
break
if self.use_customized_weight:
break
if not self.use_customized_weight:
self._weight = self.create_parameter( self._weight = self.create_parameter(
attr=self._param_attr, attr=self._param_attr,
shape=[ shape=[
...@@ -316,199 +259,31 @@ class BasicLSTMCell(RNNCell): ...@@ -316,199 +259,31 @@ class BasicLSTMCell(RNNCell):
shape=[4 * self._hidden_size], shape=[4 * self._hidden_size],
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True)
else:
if "w" in forget_gate_weights and forget_gate_weights[
"w"] is not None:
self.fg_w = forget_gate_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_forget_gate_w"
else:
tmp_param_attr = self._param_attr
self.fg_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in forget_gate_weights and forget_gate_weights[
"h"] is not None:
self.fg_h = forget_gate_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_forget_gate_h"
else:
tmp_param_attr = self._param_attr
self.fg_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in forget_gate_weights and forget_gate_weights[
"b"] is not None:
self.fg_b = forget_gate_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_forget_gate_b"
else:
tmp_param_attr = self._bias_attr
self.fg_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
if "w" in input_gate_weights and input_gate_weights[
"w"] is not None:
self.ig_w = input_gate_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_input_gate_w"
else:
tmp_param_attr = self._param_attr
self.ig_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in input_gate_weights and input_gate_weights[
"h"] is not None:
self.ig_h = input_gate_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_input_gate_h"
else:
tmp_param_attr = self._param_attr
self.ig_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in input_gate_weights and input_gate_weights[
"b"] is not None:
self.ig_b = input_gate_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_input_gate_b"
else:
tmp_param_attr = self._bias_attr
self.ig_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
if "w" in output_gate_weights and output_gate_weights[
"w"] is not None:
self.og_w = output_gate_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_output_gate_w"
else:
tmp_param_attr = self._param_attr
self.og_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in output_gate_weights and output_gate_weights[
"h"] is not None:
self.og_h = output_gate_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_output_gate_h"
else:
tmp_param_attr = self._param_attr
self.og_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in output_gate_weights and output_gate_weights[ def forward(self, inputs, states):
"b"] is not None: """
self.og_b = output_gate_weights["b"] Performs single step LSTM calculations.
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_output_gate_b"
else:
tmp_param_attr = self._bias_attr
self.og_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
if "w" in cell_weights and cell_weights["w"] is not None:
self.c_w = cell_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_cell_w"
else:
tmp_param_attr = self._param_attr
self.c_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in cell_weights and cell_weights["h"] is not None:
self.c_h = cell_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_cell_h"
else:
tmp_param_attr = self._param_attr
self.c_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in cell_weights and cell_weights["b"] is not None:
self.c_b = cell_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_cell_b"
else:
tmp_param_attr = self._bias_attr
self.c_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
def forward(self, input, state):
if self.use_customized_weight:
weight_w = fluid.layers.concat(
[self.ig_w, self.c_w, self.fg_w, self.og_w], axis=-1)
weight_h = fluid.layers.concat(
[self.ig_h, self.c_h, self.fg_h, self.og_h], axis=-1)
_weight = fluid.layers.concat([weight_w, weight_h], axis=0)
_bias = fluid.layers.concat(
[self.ig_b, self.c_b, self.fg_b, self.og_b])
else:
_weight = self._weight
_bias = self._bias
pre_hidden, pre_cell = state Parameters:
concat_input_hidden = layers.concat([input, pre_hidden], 1) inputs (Variable): A tensor with shape `[batch_size, input_size]`,
gate_input = layers.matmul(x=concat_input_hidden, y=_weight) corresponding to :math:`x_t` in the formula. The data type
should be float32 or float64.
states (Variable): A list containing two tensors, each shaped
`[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}`
in the formula. The data type should be float32 or float64.
gate_input = layers.elementwise_add(gate_input, _bias) Returns:
tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \
a tensor with shape `[batch_size, hidden_size]`, corresponding \
to :math:`h_{t}` in the formula; `new_states` is a list containing \
two tensor variables shaped `[batch_size, hidden_size]`, corresponding \
to :math:`h_{t}, c_{t}` in the formula. The data type of these \
tensors is the same as that of `states`.
"""
pre_hidden, pre_cell = states
concat_input_hidden = layers.concat([inputs, pre_hidden], 1)
gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
gate_input = layers.elementwise_add(gate_input, self._bias)
i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
new_cell = layers.elementwise_add( new_cell = layers.elementwise_add(
layers.elementwise_mul( layers.elementwise_mul(
...@@ -564,21 +339,6 @@ class BasicGRUCell(RNNCell): ...@@ -564,21 +339,6 @@ class BasicGRUCell(RNNCell):
GRU, that is :math:`act_c` in the formula. Default: None, GRU, that is :math:`act_c` in the formula. Default: None,
representing for 'fluid.layers.tanh'. representing for 'fluid.layers.tanh'.
dtype(string, optional): The data type used in this cell. Default float32. dtype(string, optional): The data type used in this cell. Default float32.
update_gate_weights (dict, optional): A dict includes `w`, `h` and `b`
as keys, and the corresponding values should be instances of Parameter
which represent :math:`W_{ux}, W_{uh}, b_{u}` and have shape
[input_size, hidden_size], [hidden_size, hidden_size], [hidden_size]
separately. It is used for reusing and sharing weights when provided,
otherwise create these parameters. Note that parameters from update
gate and reset gate would be concatenated in implementation.
reset_gate_weights (dict, optional): A dict includes `w`, `h` and `b` as keys,
and the corresponding values should be instances of Parameter which
represent :math:`W_{rx}, W_{rh}, b_{r}` separately. It has the
same usage as :attr:`update_gate_weights`.
cell_weights (dict, optional): A dict includes `w`, `h` and `b` as keys,
and the corresponding values should be instances of Parameter which
represent :math:`W_{cx}, W_{ch}, b_{c}`` separately. It has the
same usage as :attr:`update_gate_weights`.
""" """
def __init__(self, def __init__(self,
...@@ -588,16 +348,7 @@ class BasicGRUCell(RNNCell): ...@@ -588,16 +348,7 @@ class BasicGRUCell(RNNCell):
bias_attr=None, bias_attr=None,
gate_activation=None, gate_activation=None,
activation=None, activation=None,
dtype='float32', dtype='float32'):
update_gate_weights={"w": None,
"h": None,
"b": None},
reset_gate_weights={"w": None,
"h": None,
"b": None},
cell_weights={"w": None,
"h": None,
"b": None}):
super(BasicGRUCell, self).__init__() super(BasicGRUCell, self).__init__()
self._input_size = input_size self._input_size = input_size
self._hidden_size = hidden_size self._hidden_size = hidden_size
...@@ -607,20 +358,6 @@ class BasicGRUCell(RNNCell): ...@@ -607,20 +358,6 @@ class BasicGRUCell(RNNCell):
self._activation = activation or layers.tanh self._activation = activation or layers.tanh
self._dtype = dtype self._dtype = dtype
assert isinstance(update_gate_weights, dict)
assert isinstance(reset_gate_weights, dict)
assert isinstance(cell_weights, dict)
self.use_customized_weight = False
for _weights in [
update_gate_weights, reset_gate_weights, cell_weights
]:
for _key in _weights:
if _weights[_key] is not None:
self.use_customized_weight = True
if self.use_customized_weight:
break
if self._param_attr is not None and self._param_attr.name is not None: if self._param_attr is not None and self._param_attr.name is not None:
gate_param_attr = copy.deepcopy(self._param_attr) gate_param_attr = copy.deepcopy(self._param_attr)
candidate_param_attr = copy.deepcopy(self._param_attr) candidate_param_attr = copy.deepcopy(self._param_attr)
...@@ -630,7 +367,6 @@ class BasicGRUCell(RNNCell): ...@@ -630,7 +367,6 @@ class BasicGRUCell(RNNCell):
gate_param_attr = self._param_attr gate_param_attr = self._param_attr
candidate_param_attr = self._param_attr candidate_param_attr = self._param_attr
if not self.use_customized_weight:
self._gate_weight = self.create_parameter( self._gate_weight = self.create_parameter(
attr=gate_param_attr, attr=gate_param_attr,
shape=[ shape=[
...@@ -640,9 +376,7 @@ class BasicGRUCell(RNNCell): ...@@ -640,9 +376,7 @@ class BasicGRUCell(RNNCell):
self._candidate_weight = self.create_parameter( self._candidate_weight = self.create_parameter(
attr=candidate_param_attr, attr=candidate_param_attr,
shape=[ shape=[self._input_size + self._hidden_size, self._hidden_size],
self._input_size + self._hidden_size, self._hidden_size
],
dtype=self._dtype) dtype=self._dtype)
if self._bias_attr is not None and self._bias_attr.name is not None: if self._bias_attr is not None and self._bias_attr.name is not None:
...@@ -665,159 +399,30 @@ class BasicGRUCell(RNNCell): ...@@ -665,159 +399,30 @@ class BasicGRUCell(RNNCell):
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True)
else: def forward(self, inputs, states):
"""
# create the parameters of gates in gru Performs single step GRU calculations.
if "w" in update_gate_weights and update_gate_weights[
"w"] is not None:
self.ug_w = update_gate_weights["w"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_update_gate_w"
else:
tmp_param_attr = gate_param_attr
self.ug_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in update_gate_weights and update_gate_weights[
"h"] is not None:
self.ug_h = update_gate_weights["h"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_update_gate_h"
else:
tmp_param_attr = gate_param_attr
self.ug_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in update_gate_weights and update_gate_weights[
"b"] is not None:
self.ug_b = update_gate_weights["b"]
else:
if gate_bias_attr is not None and gate_bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_bias_attr)
tmp_param_attr.name += "_update_gate_b"
else:
tmp_param_attr = gate_bias_attr
self.ug_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# reset gate parameters
if "w" in reset_gate_weights and reset_gate_weights[
"w"] is not None:
self.rg_w = reset_gate_weights["w"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_reset_gate_w"
else:
tmp_param_attr = gate_param_attr
self.rg_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in reset_gate_weights and reset_gate_weights[
"h"] is not None:
self.rg_h = reset_gate_weights["h"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_reset_gate_h"
else:
tmp_param_attr = gate_param_attr
self.rg_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in reset_gate_weights and reset_gate_weights[
"b"] is not None:
self.rg_b = reset_gate_weights["b"]
else:
if gate_bias_attr is not None and gate_bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_bias_attr)
tmp_param_attr.name += "_reset_gate_b"
else:
tmp_param_attr = gate_bias_attr
self.rg_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# cell parameters
if "w" in cell_weights and cell_weights["w"] is not None:
self.c_w = cell_weights["w"]
else:
if candidate_param_attr is not None and candidate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(candidate_param_attr)
tmp_param_attr.name += "_cell_w"
else:
tmp_param_attr = gate_param_attr
self.c_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in cell_weights and cell_weights["h"] is not None:
self.c_h = cell_weights["h"]
else:
if candidate_param_attr is not None and candidate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(candidate_param_attr)
tmp_param_attr.name += "_cell_h"
else:
tmp_param_attr = gate_param_attr
self.c_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in cell_weights and cell_weights["b"] is not None:
self.c_b = cell_weights["b"]
else:
if candidate_bias_attr is not None and candidate_bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(candidate_bias_attr)
tmp_param_attr.name += "_cell_b"
else:
tmp_param_attr = gate_bias_attr
self.c_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
def forward(self, input, state):
if self.use_customized_weight: Parameters:
rg_weights = layers.concat([self.rg_w, self.rg_h], axis=0) inputs (Variable): A tensor with shape `[batch_size, input_size]`,
ug_weights = layers.concat([self.ug_w, self.ug_h], axis=0) corresponding to :math:`x_t` in the formula. The data type
_gate_weight = layers.concat([rg_weights, ug_weights], axis=-1) should be float32 or float64.
_candidate_weight = layers.concat([self.c_w, self.c_h], axis=0) states (Variable): A tensor with shape `[batch_size, hidden_size]`.
_gate_bias = layers.concat([self.rg_b, self.ug_b], axis=0) corresponding to :math:`h_{t-1}` in the formula. The data type
_candidate_bias = self.c_b should be float32 or float64.
else:
_gate_weight = self._gate_weight
_gate_bias = self._gate_bias
_candidate_weight = self._candidate_weight
_candidate_bias = self._candidate_bias
pre_hidden = state Returns:
concat_input_hidden = layers.concat([input, pre_hidden], axis=1) tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` and \
`new_states` is the same tensor shaped `[batch_size, hidden_size]`, \
corresponding to :math:`h_t` in the formula. The data type of the \
tensor is the same as that of `states`.
"""
pre_hidden = states
concat_input_hidden = layers.concat([inputs, pre_hidden], axis=1)
gate_input = layers.matmul(x=concat_input_hidden, y=_gate_weight) gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)
gate_input = layers.elementwise_add(gate_input, _gate_bias) gate_input = layers.elementwise_add(gate_input, self._gate_bias)
gate_input = self._gate_activation(gate_input) gate_input = self._gate_activation(gate_input)
r, u = layers.split(gate_input, num_or_sections=2, dim=1) r, u = layers.split(gate_input, num_or_sections=2, dim=1)
...@@ -825,8 +430,8 @@ class BasicGRUCell(RNNCell): ...@@ -825,8 +430,8 @@ class BasicGRUCell(RNNCell):
r_hidden = r * pre_hidden r_hidden = r * pre_hidden
candidate = layers.matmul( candidate = layers.matmul(
layers.concat([input, r_hidden], 1), _candidate_weight) layers.concat([inputs, r_hidden], 1), self._candidate_weight)
candidate = layers.elementwise_add(candidate, _candidate_bias) candidate = layers.elementwise_add(candidate, self._candidate_bias)
c = self._activation(candidate) c = self._activation(candidate)
new_hidden = u * pre_hidden + (1 - u) * c new_hidden = u * pre_hidden + (1 - u) * c
...@@ -2650,6 +2255,7 @@ class TransformerCell(Layer): ...@@ -2650,6 +2255,7 @@ class TransformerCell(Layer):
class Embedder(fluid.dygraph.Layer): class Embedder(fluid.dygraph.Layer):
def __init__(self): def __init__(self):
super(Embedder, self).__init__()
self.word_embedder = Embedding(size=[1000, 128]) self.word_embedder = Embedding(size=[1000, 128])
self.pos_embedder = Embedding(size=[500, 128]) self.pos_embedder = Embedding(size=[500, 128])
...@@ -2999,11 +2605,7 @@ class PrePostProcessLayer(Layer): ...@@ -2999,11 +2605,7 @@ class PrePostProcessLayer(Layer):
out = process(x) # [2, 4, 32] out = process(x) # [2, 4, 32]
""" """
def __init__(self, def __init__(self, process_cmd, d_model, dropout_rate=0.1):
process_cmd,
d_model,
dropout_rate=0.1,
reused_layer_norm=None):
super(PrePostProcessLayer, self).__init__() super(PrePostProcessLayer, self).__init__()
self.process_cmd = process_cmd self.process_cmd = process_cmd
self.functors = [] self.functors = []
...@@ -3012,9 +2614,6 @@ class PrePostProcessLayer(Layer): ...@@ -3012,9 +2614,6 @@ class PrePostProcessLayer(Layer):
self.functors.append( self.functors.append(
lambda x, y: x + y if y is not None else x) lambda x, y: x + y if y is not None else x)
elif cmd == "n": # add layer normalization elif cmd == "n": # add layer normalization
if reused_layer_norm is not None:
layer_norm = reused_layer_norm
else:
layer_norm = LayerNorm( layer_norm = LayerNorm(
normalized_shape=d_model, normalized_shape=d_model,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
...@@ -3091,16 +2690,7 @@ class MultiHeadAttention(Layer): ...@@ -3091,16 +2690,7 @@ class MultiHeadAttention(Layer):
output = multi_head_attn(query, attn_bias=attn_bias) # [2, 4, 128] output = multi_head_attn(query, attn_bias=attn_bias) # [2, 4, 128]
""" """
def __init__(self, def __init__(self, d_key, d_value, d_model, n_head, dropout_rate=0.1):
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.0,
reused_query_fc=None,
reused_key_fc=None,
reused_value_fc=None,
reused_proj_fc=None):
super(MultiHeadAttention, self).__init__() super(MultiHeadAttention, self).__init__()
self.n_head = n_head self.n_head = n_head
...@@ -3109,30 +2699,14 @@ class MultiHeadAttention(Layer): ...@@ -3109,30 +2699,14 @@ class MultiHeadAttention(Layer):
self.d_model = d_model self.d_model = d_model
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
if reused_query_fc is not None:
self.q_fc = reused_query_fc
else:
self.q_fc = Linear( self.q_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
if reused_key_fc is not None:
self.k_fc = reused_key_fc
else:
self.k_fc = Linear( self.k_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
if reused_value_fc is not None:
self.v_fc = reused_value_fc
else:
self.v_fc = Linear( self.v_fc = Linear(
input_dim=d_model, input_dim=d_model, output_dim=d_value * n_head, bias_attr=False)
output_dim=d_value * n_head,
bias_attr=False)
if reused_proj_fc is not None:
self.proj_fc = reused_proj_fc
else:
self.proj_fc = Linear( self.proj_fc = Linear(
input_dim=d_value * n_head, input_dim=d_value * n_head, output_dim=d_model, bias_attr=False)
output_dim=d_model,
bias_attr=False)
def _prepare_qkv(self, queries, keys, values, cache=None): def _prepare_qkv(self, queries, keys, values, cache=None):
""" """
...@@ -3322,23 +2896,11 @@ class FFN(Layer): ...@@ -3322,23 +2896,11 @@ class FFN(Layer):
out = ffn(x) # [2, 4, 32] out = ffn(x) # [2, 4, 32]
""" """
def __init__(self, def __init__(self, d_inner_hid, d_model, dropout_rate=0.1, fc1_act="relu"):
d_inner_hid,
d_model,
dropout_rate=0.1,
fc1_act="relu",
reused_fc1=None,
reused_fc2=None):
super(FFN, self).__init__() super(FFN, self).__init__()
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
if reused_fc1 is not None:
self.fc1 = reused_fc1
else:
self.fc1 = Linear( self.fc1 = Linear(
input_dim=d_model, output_dim=d_inner_hid, act=fc1_act) input_dim=d_model, output_dim=d_inner_hid, act=fc1_act)
if reused_fc2 is not None:
self.fc2 = reused_fc2
else:
self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model) self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model)
def forward(self, x): def forward(self, x):
...@@ -3422,51 +2984,22 @@ class TransformerEncoderLayer(Layer): ...@@ -3422,51 +2984,22 @@ class TransformerEncoderLayer(Layer):
relu_dropout=0.1, relu_dropout=0.1,
preprocess_cmd="n", preprocess_cmd="n",
postprocess_cmd="da", postprocess_cmd="da",
ffn_fc1_act="relu", ffn_fc1_act="relu"):
reused_pre_selatt_layernorm=None,
reused_multihead_att_weights={
"reused_query_fc": None,
"reused_key_fc": None,
"reused_value_fc": None,
"reused_proj_fc": None
},
reused_post_selfatt_layernorm=None,
reused_pre_ffn_layernorm=None,
reused_ffn_weights={"reused_fc1": None,
"reused_fc2": None},
reused_post_ffn_layernorm=None):
super(TransformerEncoderLayer, self).__init__() super(TransformerEncoderLayer, self).__init__()
self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout, prepostprocess_dropout)
reused_pre_selatt_layernorm) self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
self.self_attn = MultiHeadAttention( attention_dropout)
d_key, self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
d_value, prepostprocess_dropout)
d_model,
n_head,
attention_dropout,
reused_query_fc=reused_multihead_att_weights["reused_query_fc"],
reused_key_fc=reused_multihead_att_weights["reused_key_fc"],
reused_value_fc=reused_multihead_att_weights["reused_value_fc"],
reused_proj_fc=reused_multihead_att_weights["reused_proj_fc"])
self.postprocesser1 = PrePostProcessLayer(
postprocess_cmd, d_model, prepostprocess_dropout,
reused_post_selfatt_layernorm)
self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout, prepostprocess_dropout)
reused_pre_ffn_layernorm) self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act)
self.ffn = FFN(d_inner_hid,
d_model,
relu_dropout,
fc1_act=ffn_fc1_act,
reused_fc1=reused_ffn_weights["reused_fc1"],
reused_fc2=reused_ffn_weights["reused_fc2"])
self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout, prepostprocess_dropout)
reused_post_ffn_layernorm)
def forward(self, enc_input, attn_bias=None): def forward(self, enc_input, attn_bias=None):
""" """
...@@ -3667,83 +3200,33 @@ class TransformerDecoderLayer(Layer): ...@@ -3667,83 +3200,33 @@ class TransformerDecoderLayer(Layer):
d_value, d_value,
d_model, d_model,
d_inner_hid, d_inner_hid,
prepostprocess_dropout, prepostprocess_dropout=0.1,
attention_dropout, attention_dropout=0.1,
relu_dropout, relu_dropout=0.1,
preprocess_cmd="n", preprocess_cmd="n",
postprocess_cmd="da", postprocess_cmd="da",
reused_pre_selfatt_layernorm=None, ffn_fc1_act="relu"):
reused_self_multihead_att_weights={
"reused_query_fc": None,
"reused_key_fc": None,
"reused_value_fc": None,
"reused_proj_fc": None
},
reused_post_selfatt_layernorm=None,
reused_pre_crossatt_layernorm=None,
reused_cross_multihead_att_weights={
"reused_query_fc": None,
"reused_key_fc": None,
"reused_value_fc": None,
"reused_proj_fc": None
},
reused_post_crossatt_layernorm=None,
reused_pre_ffn_layernorm=None,
reused_ffn_weights={"reused_fc1": None,
"reused_fc2": None},
reused_post_ffn_layernorm=None):
super(TransformerDecoderLayer, self).__init__() super(TransformerDecoderLayer, self).__init__()
self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout, prepostprocess_dropout)
reused_pre_selfatt_layernorm) self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
self.self_attn = MultiHeadAttention( attention_dropout)
d_key, self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
d_value, prepostprocess_dropout)
d_model,
n_head,
attention_dropout,
reused_query_fc=reused_self_multihead_att_weights[
"reused_query_fc"],
reused_key_fc=reused_self_multihead_att_weights["reused_key_fc"],
reused_value_fc=reused_self_multihead_att_weights[
"reused_value_fc"],
reused_proj_fc=reused_self_multihead_att_weights["reused_proj_fc"])
self.postprocesser1 = PrePostProcessLayer(
postprocess_cmd, d_model, prepostprocess_dropout,
reused_post_selfatt_layernorm)
self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout, prepostprocess_dropout)
reused_pre_crossatt_layernorm) self.cross_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
self.cross_attn = MultiHeadAttention( attention_dropout)
d_key, self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
d_value, prepostprocess_dropout)
d_model,
n_head,
attention_dropout,
reused_query_fc=reused_cross_multihead_att_weights[
"reused_query_fc"],
reused_key_fc=reused_cross_multihead_att_weights["reused_key_fc"],
reused_value_fc=reused_cross_multihead_att_weights[
"reused_value_fc"],
reused_proj_fc=reused_cross_multihead_att_weights[
"reused_proj_fc"])
self.postprocesser2 = PrePostProcessLayer(
postprocess_cmd, d_model, prepostprocess_dropout,
reused_post_crossatt_layernorm)
self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model, self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout, prepostprocess_dropout)
reused_pre_ffn_layernorm) self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act)
self.ffn = FFN(d_inner_hid,
d_model,
relu_dropout,
reused_fc1=reused_ffn_weights["reused_fc1"],
reused_fc2=reused_ffn_weights["reused_fc2"])
self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model, self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout, prepostprocess_dropout)
reused_post_ffn_layernorm)
def forward(self, def forward(self,
dec_input, dec_input,
...@@ -3991,101 +3474,6 @@ class TransformerDecoder(Layer): ...@@ -3991,101 +3474,6 @@ class TransformerDecoder(Layer):
} for i in range(self.n_layer)] } for i in range(self.n_layer)]
#TODO: we should merge GRUCell with BasicGRUCell
class GRUCell(RNNCell):
    """
    Single-step GRU cell built from a `Linear` input projection followed
    by a `GRUUnit`.

    Args:
        input_size: Input feature size handed to the `Linear` projection.
        hidden_size: Hidden size of the cell. The projection output and the
            size given to `GRUUnit` are both `hidden_size * 3`.
        param_attr: Parameter attribute forwarded to both the `Linear`
            projection and the `GRUUnit`.
        bias_attr: Bias attribute forwarded to the `GRUUnit` only.
        gate_activation: Forwarded to `GRUUnit` as `gate_activation`.
        candidate_activation: Forwarded to `GRUUnit` as `activation`.
        origin_mode: Forwarded to `GRUUnit` as `origin_mode`.
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 param_attr=None,
                 bias_attr=None,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 origin_mode=False):
        super(GRUCell, self).__init__()
        self.hidden_size = hidden_size

        # The projection and the GRU unit share the same 3 * hidden width.
        gate_width = hidden_size * 3
        self.fc_layer = Linear(input_size, gate_width, param_attr=param_attr)
        self.gru_unit = GRUUnit(
            gate_width,
            param_attr=param_attr,
            bias_attr=bias_attr,
            activation=candidate_activation,
            gate_activation=gate_activation,
            origin_mode=origin_mode)

    def forward(self, inputs, states):
        """
        Run one step of the cell.

        Args:
            inputs: Step input, fed through the `Linear` projection.
            states: Previous state, passed to `GRUUnit` together with the
                projected input.

        Returns:
            A pair `(hidden, hidden)`: the step output and the new state are
            the same value for this cell.
        """
        projected = self.fc_layer(inputs)
        # Only the first of the three values returned by GRUUnit is used.
        new_hidden, _, _ = self.gru_unit(projected, states)
        return new_hidden, new_hidden

    @property
    def state_shape(self):
        """Shape of the cell state: a one-element list `[hidden_size]`."""
        return [self.hidden_size]
#TODO: we should merge GRUCell with BasicGRUCell
class GRUEncoderCell(RNNCell):
    """
    A stack of `GRUCell` layers applied one after another within a single
    time step, with optional dropout on the value passed between layers.

    Args:
        num_layers: Number of stacked `GRUCell` layers.
        input_size: Input size of the first layer; every later layer takes
            `hidden_size` as its input size.
        hidden_size: Hidden size used by every layer.
        dropout_prob: Dropout probability applied to each layer's output
            before it is fed to the next layer. Dropout is skipped entirely
            when this is not greater than 0.
        init_scale: Each layer's `param_attr` uses a `UniformInitializer`
            with bounds `[-init_scale, init_scale]`.
    """

    def __init__(self,
                 num_layers,
                 input_size,
                 hidden_size,
                 dropout_prob=0.,
                 init_scale=0.1):
        super(GRUEncoderCell, self).__init__()
        self.dropout_prob = dropout_prob
        # Cells are kept in a plain list for ordered iteration; each one is
        # also registered through add_sublayer under the name "gru_<i>".
        self.gru_cells = []
        for i in range(num_layers):
            self.gru_cells.append(
                self.add_sublayer(
                    "gru_%d" % i,
                    GRUCell(
                        input_size=input_size if i == 0 else hidden_size,
                        hidden_size=hidden_size,
                        param_attr=fluid.ParamAttr(
                            initializer=fluid.initializer.UniformInitializer(
                                low=-init_scale, high=init_scale)))))

    def forward(self, step_input, states):
        """
        Run one time step through every stacked layer.

        Args:
            step_input: Input to the first layer for this step.
            states: Indexable collection holding one previous state per
                layer, in layer order.

        Returns:
            A pair `(step_output, new_states)`, where `step_output` is the
            (possibly dropped-out) output of the last layer and `new_states`
            is the list of new per-layer states.
        """
        new_states = []
        for i, gru_cell in enumerate(self.gru_cells):
            out, state = gru_cell(step_input, states[i])
            # Carry the cell's own state to the next time step. Previously
            # the dropout-masked output was stored here, which leaked the
            # dropout mask into the recurrent state whenever
            # dropout_prob > 0 (and left `state` unused).
            new_states.append(state)
            # Dropout only affects what is handed to the next layer.
            if self.dropout_prob > 0:
                step_input = layers.dropout(
                    out,
                    self.dropout_prob,
                    dropout_implementation='upscale_in_train')
            else:
                step_input = out
        return step_input, new_states

    @property
    def state_shape(self):
        """List of the `state_shape` of every stacked cell, in layer order."""
        return [cell.state_shape for cell in self.gru_cells]
class BiGRU(fluid.dygraph.Layer):
    """
    Bidirectional GRU layer: two single-layer `GRUEncoderCell` recurrences,
    one run forward and one run in reverse over the same input, with their
    outputs concatenated along the last axis.

    Args:
        input_dim: Input size given to both encoder cells.
        grnn_hidden_dim: Hidden size given to both encoder cells.
        init_bound: Passed to both encoder cells as `init_scale`.
        h_0: Accepted but not used by this layer.
    """

    def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
        super(BiGRU, self).__init__()

        # Forward-direction recurrence (one layer, no dropout).
        self.gru = RNN(
            GRUEncoderCell(
                num_layers=1,
                input_size=input_dim,
                hidden_size=grnn_hidden_dim,
                dropout_prob=0.0,
                init_scale=init_bound),
            is_reverse=False,
            time_major=False)

        # Reverse-direction recurrence with the same configuration.
        self.gru_r = RNN(
            GRUEncoderCell(
                num_layers=1,
                input_size=input_dim,
                hidden_size=grnn_hidden_dim,
                dropout_prob=0.0,
                init_scale=init_bound),
            is_reverse=True,
            time_major=False)

    def forward(self, input_feature):
        """
        Encode `input_feature` in both directions.

        Args:
            input_feature: Input fed unchanged to both recurrences.

        Returns:
            The forward and reverse outputs concatenated on the last axis.
            The final states of both recurrences are discarded.
        """
        forward_out, _ = self.gru(input_feature)
        backward_out, _ = self.gru_r(input_feature)
        return fluid.layers.concat(
            input=[forward_out, backward_out], axis=-1)
class LinearChainCRF(Layer): class LinearChainCRF(Layer):
""" """
Computes the negtive log-likelihood of tag sequences in a linear chain CRF. Computes the negtive log-likelihood of tag sequences in a linear chain CRF.
...@@ -4349,7 +3737,7 @@ class CRFDecoding(Layer): ...@@ -4349,7 +3737,7 @@ class CRFDecoding(Layer):
return viterbi_path return viterbi_path
class GRUEncoder(Layer): class _GRUEncoder(Layer):
""" """
A multi-layer bidirectional GRU encoder used by SequenceTagging. A multi-layer bidirectional GRU encoder used by SequenceTagging.
""" """
...@@ -4360,7 +3748,7 @@ class GRUEncoder(Layer): ...@@ -4360,7 +3748,7 @@ class GRUEncoder(Layer):
init_bound, init_bound,
num_layers=1, num_layers=1,
is_bidirection=False): is_bidirection=False):
super(GRUEncoder, self).__init__() super(_GRUEncoder, self).__init__()
self.num_layers = num_layers self.num_layers = num_layers
self.is_bidirection = is_bidirection self.is_bidirection = is_bidirection
self.gru_list = [] self.gru_list = []
...@@ -4475,7 +3863,7 @@ class SequenceTagging(Layer): ...@@ -4475,7 +3863,7 @@ class SequenceTagging(Layer):
initializer=fluid.initializer.Uniform( initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound))) low=-self.init_bound, high=self.init_bound)))
self.gru_encoder = GRUEncoder( self.gru_encoder = _GRUEncoder(
input_dim=self.grnn_hidden_dim, input_dim=self.grnn_hidden_dim,
grnn_hidden_dim=self.grnn_hidden_dim, grnn_hidden_dim=self.grnn_hidden_dim,
init_bound=self.init_bound, init_bound=self.init_bound,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册