未验证 提交 aa02e347 编写于 作者: G Guo Sheng 提交者: GitHub

Add hapi.text and corresponding unit test. (#24457)

* Add hapi.text and corresponding unit test.
test=develop

* Remove hapi.text apis' reuse parameter args for coverage.
test=develop

* Fix TransformerCell and TransformerBeamSearchDecoder example codes.
test=develop

* Fix example codes in hapi.text.
test=develop

* Add some apis in hapi.text into example code white list.
test=develop

* Fix example code of DynamicDecode in hapi.text.
text=develop

* Rename Model.self as model in test_text.py
test=develop
上级 526a2117
......@@ -22,6 +22,7 @@ from . import loss
from . import datasets
from . import distributed
from . import vision
from . import text
logger.setup_logger()
......@@ -33,6 +34,7 @@ __all__ = [
'metrics',
'loss',
'vision',
'text',
]
__all__ += model.__all__
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import print_function
import unittest
import random
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, Linear, Layer
from paddle.fluid.layers import BeamSearchDecoder
from paddle.incubate.hapi.model import Model, Input, set_device
from paddle.incubate.hapi.text import *
class ModuleApiTest(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls._np_rand_state = np.random.get_state()
cls._py_rand_state = random.getstate()
cls._random_seed = 123
np.random.seed(cls._random_seed)
random.seed(cls._random_seed)
cls.model_cls = type(cls.__name__ + "Model", (Model, ), {
"__init__": cls.model_init_wrapper(cls.model_init),
"forward": cls.model_forward
})
@classmethod
def tearDownClass(cls):
np.random.set_state(cls._np_rand_state)
random.setstate(cls._py_rand_state)
@staticmethod
def model_init_wrapper(func):
def __impl__(self, *args, **kwargs):
Model.__init__(self)
func(self, *args, **kwargs)
return __impl__
@staticmethod
def model_init(model, *args, **kwargs):
raise NotImplementedError(
"model_init acts as `Model.__init__`, thus must implement it")
@staticmethod
def model_forward(model, *args, **kwargs):
return model.module(*args, **kwargs)
def make_inputs(self):
# TODO(guosheng): add default from `self.inputs`
raise NotImplementedError(
"model_inputs makes inputs for model, thus must implement it")
def setUp(self):
"""
For the model which wraps the module to be tested:
Set input data by `self.inputs` list
Set init argument values by `self.attrs` list/dict
Set model parameter values by `self.param_states` dict
Set expected output data by `self.outputs` list
We can create a model instance and run once with these.
"""
self.inputs = []
self.attrs = {}
self.param_states = {}
self.outputs = []
def _calc_output(self, place, mode="test", dygraph=True):
if dygraph:
fluid.enable_dygraph(place)
else:
fluid.disable_dygraph()
fluid.default_main_program().random_seed = self._random_seed
fluid.default_startup_program().random_seed = self._random_seed
model = self.model_cls(**self.attrs) if isinstance(
self.attrs, dict) else self.model_cls(*self.attrs)
model.prepare(inputs=self.make_inputs(), device=place)
if self.param_states:
model.load(self.param_states, optim_state=None)
return model.test_batch(self.inputs)
def check_output_with_place(self, place, mode="test"):
dygraph_output = self._calc_output(place, mode, dygraph=True)
stgraph_output = self._calc_output(place, mode, dygraph=False)
expect_output = getattr(self, "outputs", None)
for actual_t, expect_t in zip(dygraph_output, stgraph_output):
self.assertTrue(np.allclose(actual_t, expect_t, rtol=1e-5, atol=0))
if expect_output:
for actual_t, expect_t in zip(dygraph_output, expect_output):
self.assertTrue(
np.allclose(
actual_t, expect_t, rtol=1e-5, atol=0))
def check_output(self):
devices = ["CPU", "GPU"] if fluid.is_compiled_with_cuda() else ["CPU"]
for device in devices:
place = set_device(device)
self.check_output_with_place(place)
class TestBasicLSTM(ModuleApiTest):
def setUp(self):
# TODO(guosheng): Change to big size. Currently bigger hidden size for
# LSTM would fail, the second static graph run might get diff output
# with others.
shape = (2, 4, 16)
self.inputs = [np.random.random(shape).astype("float32")]
self.outputs = None
self.attrs = {"input_size": 16, "hidden_size": 16}
self.param_states = {}
@staticmethod
def model_init(model, input_size, hidden_size):
model.lstm = RNN(
BasicLSTMCell(
input_size,
hidden_size,
param_attr=fluid.ParamAttr(name="lstm_weight"),
bias_attr=fluid.ParamAttr(name="lstm_bias")))
@staticmethod
def model_forward(model, inputs):
return model.lstm(inputs)[0]
def make_inputs(self):
inputs = [
Input(
[None, None, self.inputs[-1].shape[-1]],
"float32",
name="input"),
]
return inputs
def test_check_output(self):
self.check_output()
class TestBasicGRU(ModuleApiTest):
def setUp(self):
shape = (2, 4, 128)
self.inputs = [np.random.random(shape).astype("float32")]
self.outputs = None
self.attrs = {"input_size": 128, "hidden_size": 128}
self.param_states = {}
@staticmethod
def model_init(model, input_size, hidden_size):
model.gru = RNN(BasicGRUCell(input_size, hidden_size))
@staticmethod
def model_forward(model, inputs):
return model.gru(inputs)[0]
def make_inputs(self):
inputs = [
Input(
[None, None, self.inputs[-1].shape[-1]],
"float32",
name="input"),
]
return inputs
def test_check_output(self):
self.check_output()
class TestBeamSearch(ModuleApiTest):
def setUp(self):
shape = (8, 32)
self.inputs = [
np.random.random(shape).astype("float32"),
np.random.random(shape).astype("float32")
]
self.outputs = None
self.attrs = {
"vocab_size": 100,
"embed_dim": 32,
"hidden_size": 32,
}
self.param_states = {}
@staticmethod
def model_init(self,
vocab_size,
embed_dim,
hidden_size,
bos_id=0,
eos_id=1,
beam_size=4,
max_step_num=20):
embedder = Embedding(size=[vocab_size, embed_dim])
output_layer = Linear(hidden_size, vocab_size)
cell = BasicLSTMCell(embed_dim, hidden_size)
decoder = BeamSearchDecoder(
cell,
start_token=bos_id,
end_token=eos_id,
beam_size=beam_size,
embedding_fn=embedder,
output_fn=output_layer)
self.beam_search_decoder = DynamicDecode(
decoder, max_step_num=max_step_num, is_test=True)
@staticmethod
def model_forward(model, init_hidden, init_cell):
return model.beam_search_decoder([init_hidden, init_cell])[0]
def make_inputs(self):
inputs = [
Input(
[None, self.inputs[0].shape[-1]], "float32",
name="init_hidden"),
Input(
[None, self.inputs[1].shape[-1]], "float32", name="init_cell"),
]
return inputs
def test_check_output(self):
self.check_output()
class TestTransformerEncoder(ModuleApiTest):
def setUp(self):
self.inputs = [
# encoder input: [batch_size, seq_len, hidden_size]
np.random.random([2, 4, 512]).astype("float32"),
# self attention bias: [batch_size, n_head, seq_len, seq_len]
np.random.randint(0, 1, [2, 8, 4, 4]).astype("float32") * -1e9
]
self.outputs = None
self.attrs = {
"n_layer": 2,
"n_head": 8,
"d_key": 64,
"d_value": 64,
"d_model": 512,
"d_inner_hid": 1024
}
self.param_states = {}
@staticmethod
def model_init(model,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
preprocess_cmd="n",
postprocess_cmd="da",
ffn_fc1_act="relu"):
model.encoder = TransformerEncoder(
n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
prepostprocess_dropout, attention_dropout, relu_dropout,
preprocess_cmd, postprocess_cmd, ffn_fc1_act)
@staticmethod
def model_forward(model, enc_input, attn_bias):
return model.encoder(enc_input, attn_bias)
def make_inputs(self):
inputs = [
Input(
[None, None, self.inputs[0].shape[-1]],
"float32",
name="enc_input"),
Input(
[None, self.inputs[1].shape[1], None, None],
"float32",
name="attn_bias"),
]
return inputs
def test_check_output(self):
self.check_output()
class TestTransformerDecoder(TestTransformerEncoder):
def setUp(self):
self.inputs = [
# decoder input: [batch_size, seq_len, hidden_size]
np.random.random([2, 4, 512]).astype("float32"),
# encoder output: [batch_size, seq_len, hidden_size]
np.random.random([2, 5, 512]).astype("float32"),
# self attention bias: [batch_size, n_head, seq_len, seq_len]
np.random.randint(0, 1, [2, 8, 4, 4]).astype("float32") * -1e9,
# cross attention bias: [batch_size, n_head, seq_len, seq_len]
np.random.randint(0, 1, [2, 8, 4, 5]).astype("float32") * -1e9
]
self.outputs = None
self.attrs = {
"n_layer": 2,
"n_head": 8,
"d_key": 64,
"d_value": 64,
"d_model": 512,
"d_inner_hid": 1024
}
self.param_states = {}
@staticmethod
def model_init(model,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
preprocess_cmd="n",
postprocess_cmd="da"):
model.decoder = TransformerDecoder(
n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
prepostprocess_dropout, attention_dropout, relu_dropout,
preprocess_cmd, postprocess_cmd)
@staticmethod
def model_forward(model,
dec_input,
enc_output,
self_attn_bias,
cross_attn_bias,
caches=None):
return model.decoder(dec_input, enc_output, self_attn_bias,
cross_attn_bias, caches)
def make_inputs(self):
inputs = [
Input(
[None, None, self.inputs[0].shape[-1]],
"float32",
name="dec_input"),
Input(
[None, None, self.inputs[0].shape[-1]],
"float32",
name="enc_output"),
Input(
[None, self.inputs[-1].shape[1], None, None],
"float32",
name="self_attn_bias"),
Input(
[None, self.inputs[-1].shape[1], None, None],
"float32",
name="cross_attn_bias"),
]
return inputs
def test_check_output(self):
self.check_output()
class TestTransformerBeamSearchDecoder(ModuleApiTest):
def setUp(self):
self.inputs = [
# encoder output: [batch_size, seq_len, hidden_size]
np.random.random([2, 5, 128]).astype("float32"),
# cross attention bias: [batch_size, n_head, seq_len, seq_len]
np.random.randint(0, 1, [2, 2, 1, 5]).astype("float32") * -1e9
]
self.outputs = None
self.attrs = {
"vocab_size": 100,
"n_layer": 2,
"n_head": 2,
"d_key": 64,
"d_value": 64,
"d_model": 128,
"d_inner_hid": 128
}
self.param_states = {}
@staticmethod
def model_init(model,
vocab_size,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
preprocess_cmd="n",
postprocess_cmd="da",
bos_id=0,
eos_id=1,
beam_size=4,
max_step_num=20):
model.beam_size = beam_size
def embeder_init(self, size):
Layer.__init__(self)
self.embedder = Embedding(size)
Embedder = type("Embedder", (Layer, ), {
"__init__": embeder_init,
"forward": lambda self, word, pos: self.embedder(word)
})
embedder = Embedder(size=[vocab_size, d_model])
output_layer = Linear(d_model, vocab_size)
model.decoder = TransformerDecoder(
n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
prepostprocess_dropout, attention_dropout, relu_dropout,
preprocess_cmd, postprocess_cmd)
transformer_cell = TransformerCell(model.decoder, embedder,
output_layer)
model.beam_search_decoder = DynamicDecode(
TransformerBeamSearchDecoder(
transformer_cell, bos_id, eos_id, beam_size,
var_dim_in_state=2),
max_step_num,
is_test=True)
@staticmethod
def model_forward(model, enc_output, trg_src_attn_bias):
caches = model.decoder.prepare_incremental_cache(enc_output)
enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
enc_output, model.beam_size)
trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
trg_src_attn_bias, model.beam_size)
static_caches = model.decoder.prepare_static_cache(enc_output)
rs, _ = model.beam_search_decoder(
inits=caches,
enc_output=enc_output,
trg_src_attn_bias=trg_src_attn_bias,
static_caches=static_caches)
return rs
def make_inputs(self):
inputs = [
Input(
[None, None, self.inputs[0].shape[-1]],
"float32",
name="enc_output"),
Input(
[None, self.inputs[1].shape[1], None, None],
"float32",
name="trg_src_attn_bias"),
]
return inputs
def test_check_output(self):
self.check_output()
class TestSequenceTagging(ModuleApiTest):
def setUp(self):
self.inputs = [
np.random.randint(0, 100, (2, 8)).astype("int64"),
np.random.randint(1, 8, (2)).astype("int64"),
np.random.randint(0, 5, (2, 8)).astype("int64")
]
self.outputs = None
self.attrs = {"vocab_size": 100, "num_labels": 5}
self.param_states = {}
@staticmethod
def model_init(model,
vocab_size,
num_labels,
word_emb_dim=128,
grnn_hidden_dim=128,
emb_learning_rate=0.1,
crf_learning_rate=0.1,
bigru_num=2,
init_bound=0.1):
model.tagger = SequenceTagging(vocab_size, num_labels, word_emb_dim,
grnn_hidden_dim, emb_learning_rate,
crf_learning_rate, bigru_num, init_bound)
@staticmethod
def model_forward(model, word, lengths, target=None):
return model.tagger(word, lengths, target)
def make_inputs(self):
inputs = [
Input(
[None, None], "int64", name="word"),
Input(
[None], "int64", name="lengths"),
Input(
[None, None], "int64", name="target"),
]
return inputs
def test_check_output(self):
self.check_output()
class TestSequenceTaggingInfer(TestSequenceTagging):
def setUp(self):
super(TestSequenceTaggingInfer, self).setUp()
self.inputs = self.inputs[:2] # remove target
def make_inputs(self):
inputs = super(TestSequenceTaggingInfer,
self).make_inputs()[:2] # remove target
return inputs
class TestStackedRNN(ModuleApiTest):
def setUp(self):
shape = (2, 4, 16)
self.inputs = [np.random.random(shape).astype("float32")]
self.outputs = None
self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2}
self.param_states = {}
@staticmethod
def model_init(model, input_size, hidden_size, num_layers):
cells = [
BasicLSTMCell(input_size, hidden_size),
BasicLSTMCell(hidden_size, hidden_size)
]
stacked_cell = StackedRNNCell(cells)
model.lstm = RNN(stacked_cell)
@staticmethod
def model_forward(self, inputs):
return self.lstm(inputs)[0]
def make_inputs(self):
inputs = [
Input(
[None, None, self.inputs[-1].shape[-1]],
"float32",
name="input"),
]
return inputs
def test_check_output(self):
self.check_output()
class TestLSTM(ModuleApiTest):
def setUp(self):
shape = (2, 4, 16)
self.inputs = [np.random.random(shape).astype("float32")]
self.outputs = None
self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2}
self.param_states = {}
@staticmethod
def model_init(model, input_size, hidden_size, num_layers):
model.lstm = LSTM(input_size, hidden_size, num_layers=num_layers)
@staticmethod
def model_forward(model, inputs):
return model.lstm(inputs)[0]
def make_inputs(self):
inputs = [
Input(
[None, None, self.inputs[-1].shape[-1]],
"float32",
name="input"),
]
return inputs
def test_check_output(self):
self.check_output()
class TestBiLSTM(ModuleApiTest):
def setUp(self):
shape = (2, 4, 16)
self.inputs = [np.random.random(shape).astype("float32")]
self.outputs = None
self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2}
self.param_states = {}
@staticmethod
def model_init(model,
input_size,
hidden_size,
num_layers,
merge_mode="concat",
merge_each_layer=False):
model.bilstm = BidirectionalLSTM(
input_size,
hidden_size,
num_layers=num_layers,
merge_mode=merge_mode,
merge_each_layer=merge_each_layer)
@staticmethod
def model_forward(model, inputs):
return model.bilstm(inputs)[0]
def make_inputs(self):
inputs = [
Input(
[None, None, self.inputs[-1].shape[-1]],
"float32",
name="input"),
]
return inputs
def test_check_output_merge0(self):
self.check_output()
def test_check_output_merge1(self):
self.attrs["merge_each_layer"] = True
self.check_output()
class TestGRU(ModuleApiTest):
def setUp(self):
shape = (2, 4, 64)
self.inputs = [np.random.random(shape).astype("float32")]
self.outputs = None
self.attrs = {"input_size": 64, "hidden_size": 128, "num_layers": 2}
self.param_states = {}
@staticmethod
def model_init(model, input_size, hidden_size, num_layers):
model.gru = GRU(input_size, hidden_size, num_layers=num_layers)
@staticmethod
def model_forward(model, inputs):
return model.gru(inputs)[0]
def make_inputs(self):
inputs = [
Input(
[None, None, self.inputs[-1].shape[-1]],
"float32",
name="input"),
]
return inputs
def test_check_output(self):
self.check_output()
class TestBiGRU(ModuleApiTest):
def setUp(self):
shape = (2, 4, 64)
self.inputs = [np.random.random(shape).astype("float32")]
self.outputs = None
self.attrs = {"input_size": 64, "hidden_size": 128, "num_layers": 2}
self.param_states = {}
@staticmethod
def model_init(model,
input_size,
hidden_size,
num_layers,
merge_mode="concat",
merge_each_layer=False):
model.bigru = BidirectionalGRU(
input_size,
hidden_size,
num_layers=num_layers,
merge_mode=merge_mode,
merge_each_layer=merge_each_layer)
@staticmethod
def model_forward(model, inputs):
return model.bigru(inputs)[0]
def make_inputs(self):
inputs = [
Input(
[None, None, self.inputs[-1].shape[-1]],
"float32",
name="input"),
]
return inputs
def test_check_output_merge0(self):
self.check_output()
def test_check_output_merge1(self):
self.attrs["merge_each_layer"] = True
self.check_output()
class TestCNNEncoder(ModuleApiTest):
def setUp(self):
shape = (2, 32, 8) # [N, C, H]
self.inputs = [np.random.random(shape).astype("float32")]
self.outputs = None
self.attrs = {"num_channels": 32, "num_filters": 64, "num_layers": 2}
self.param_states = {}
@staticmethod
def model_init(model, num_channels, num_filters, num_layers):
model.cnn_encoder = CNNEncoder(
num_layers=2,
num_channels=num_channels,
num_filters=num_filters,
filter_size=[2, 3],
pool_size=[7, 6])
@staticmethod
def model_forward(model, inputs):
return model.cnn_encoder(inputs)
def make_inputs(self):
inputs = [
Input(
[None, self.inputs[-1].shape[1], None], "float32",
name="input"),
]
return inputs
def test_check_output(self):
self.check_output()
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import text
from .text import *
__all__ = text.__all__
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import collections
import six
import sys
from functools import partial, reduce
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers.utils as utils
from paddle.fluid import layers
from paddle.fluid.layers import BeamSearchDecoder
from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
from paddle.fluid.dygraph import Layer, Embedding, Linear, LayerNorm, GRUUnit, Conv2D, Pool2D
from paddle.fluid.data_feeder import convert_dtype
__all__ = [
'RNNCell',
'BasicLSTMCell',
'BasicGRUCell',
'RNN',
'BidirectionalRNN',
'StackedRNNCell',
'StackedLSTMCell',
'LSTM',
'BidirectionalLSTM',
'StackedGRUCell',
'GRU',
'BidirectionalGRU',
'DynamicDecode',
'BeamSearchDecoder',
'Conv1dPoolLayer',
'CNNEncoder',
'MultiHeadAttention',
'FFN',
'TransformerEncoderLayer',
'TransformerEncoder',
'TransformerDecoderLayer',
'TransformerDecoder',
'TransformerCell',
'TransformerBeamSearchDecoder',
'LinearChainCRF',
'CRFDecoding',
'SequenceTagging',
]
class RNNCell(Layer):
"""
RNNCell is the base class for abstraction representing the calculations
mapping the input and state to the output and new state. It is suitable to
and mostly used in RNN.
"""
def get_initial_states(self,
batch_ref,
shape=None,
dtype=None,
init_value=0,
batch_dim_idx=0):
"""
Generate initialized states according to provided shape, data type and
value.
Parameters:
batch_ref: A (possibly nested structure of) tensor variable[s].
The first dimension of the tensor will be used as batch size to
initialize states.
shape: A (possibly nested structure of) shape[s], where a shape is
represented as a list/tuple of integer). -1(for batch size) will
beautomatically inserted if shape is not started with it. If None,
property `state_shape` will be used. The default value is None.
dtype: A (possibly nested structure of) data type[s]. The structure
must be same as that of `shape`, except when all tensors' in states
has the same data type, a single data type can be used. If None and
property `cell.state_shape` is not available, float32 will be used
as the data type. The default value is None.
init_value: A float value used to initialize states.
batch_dim_idx: An integer indicating which dimension of the tensor in
inputs represents batch size. The default value is 0.
Returns:
Variable: tensor variable[s] packed in the same structure provided \
by shape, representing the initialized states.
"""
# TODO: use inputs and batch_size
batch_ref = flatten(batch_ref)[0]
def _is_shape_sequence(seq):
if sys.version_info < (3, ):
integer_types = (
int,
long, )
else:
integer_types = (int, )
"""For shape, list/tuple of integer is the finest-grained objection"""
if (isinstance(seq, list) or isinstance(seq, tuple)):
if reduce(lambda flag, x: isinstance(x, integer_types) and flag,
seq, True):
return False
# TODO: Add check for the illegal
if isinstance(seq, dict):
return True
return (isinstance(seq, collections.Sequence) and
not isinstance(seq, six.string_types))
class Shape(object):
def __init__(self, shape):
self.shape = shape if shape[0] == -1 else ([-1] + list(shape))
# nested structure of shapes
states_shapes = self.state_shape if shape is None else shape
is_sequence_ori = utils.is_sequence
utils.is_sequence = _is_shape_sequence
states_shapes = map_structure(lambda shape: Shape(shape), states_shapes)
utils.is_sequence = is_sequence_ori
# nested structure of dtypes
try:
states_dtypes = self.state_dtype if dtype is None else dtype
except NotImplementedError: # use fp32 as default
states_dtypes = "float32"
if len(flatten(states_dtypes)) == 1:
dtype = flatten(states_dtypes)[0]
states_dtypes = map_structure(lambda shape: dtype, states_shapes)
init_states = map_structure(
lambda shape, dtype: fluid.layers.fill_constant_batch_size_like(
input=batch_ref,
shape=shape.shape,
dtype=dtype,
value=init_value,
input_dim_idx=batch_dim_idx), states_shapes, states_dtypes)
return init_states
@property
def state_shape(self):
"""
Abstract method (property).
Used to initialize states.
A (possiblely nested structure of) shape[s], where a shape is represented
as a list/tuple of integers (-1 for batch size would be automatically
inserted into a shape if shape is not started with it).
Not necessary to be implemented if states are not initialized by
`get_initial_states` or the `shape` argument is provided when using
`get_initial_states`.
"""
raise NotImplementedError(
"Please add implementaion for `state_shape` in the used cell.")
@property
def state_dtype(self):
"""
Abstract method (property).
Used to initialize states.
A (possiblely nested structure of) data types[s]. The structure must be
same as that of `shape`, except when all tensors' in states has the same
data type, a signle data type can be used.
Not necessary to be implemented if states are not initialized
by `get_initial_states` or the `dtype` argument is provided when using
`get_initial_states`.
"""
raise NotImplementedError(
"Please add implementaion for `state_dtype` in the used cell.")
class BasicLSTMCell(RNNCell):
"""
Long-Short Term Memory(LSTM) RNN cell.
The formula used is as follows:
.. math::
i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
h_{t} & = o_{t} act_c (c_{t})
Please refer to `An Empirical Exploration of Recurrent Network Architectures
<http://proceedings.mlr.press/v37/jozefowicz15.pdf>`_ for more details.
Parameters:
input_size (int): The input size in the LSTM cell.
hidden_size (int): The hidden size in the LSTM cell.
param_attr(ParamAttr, optional): The parameter attribute for the learnable
weight matrix. Default: None.
bias_attr (ParamAttr, optional): The parameter attribute for the bias
of LSTM. Default: None.
gate_activation (function, optional): The activation function for gates
of LSTM, that is :math:`act_g` in the formula. Default: None,
representing for `fluid.layers.sigmoid`.
activation (function, optional): The non-gate activation function of
LSTM, that is :math:`act_c` in the formula. Default: None,
representing for 'fluid.layers.tanh'.
forget_bias(float, optional): forget bias used when computing forget gate.
Default 1.0
dtype(string, optional): The data type used in this cell. Default float32.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import BasicLSTMCell, RNN
inputs = paddle.rand((2, 4, 32))
cell = BasicLSTMCell(input_size=32, hidden_size=64)
rnn = RNN(cell=cell)
outputs, _ = rnn(inputs) # [2, 4, 64]
"""
def __init__(self,
input_size,
hidden_size,
param_attr=None,
bias_attr=None,
gate_activation=None,
activation=None,
forget_bias=1.0,
dtype='float32'):
super(BasicLSTMCell, self).__init__()
self._hidden_size = hidden_size
self._param_attr = param_attr
self._bias_attr = bias_attr
self._gate_activation = gate_activation or layers.sigmoid
self._activation = activation or layers.tanh
# TODO(guosheng): find better way to resolve constants in __init__
self._forget_bias = layers.create_global_var(
shape=[1], dtype=dtype, value=forget_bias, persistable=True)
self._forget_bias.stop_gradient = True
self._dtype = dtype
self._input_size = input_size
self._weight = self.create_parameter(
attr=self._param_attr,
shape=[
self._input_size + self._hidden_size, 4 * self._hidden_size
],
dtype=self._dtype)
self._bias = self.create_parameter(
attr=self._bias_attr,
shape=[4 * self._hidden_size],
dtype=self._dtype,
is_bias=True)
def forward(self, inputs, states):
"""
Performs single step LSTM calculations.
Parameters:
inputs (Variable): A tensor with shape `[batch_size, input_size]`,
corresponding to :math:`x_t` in the formula. The data type
should be float32 or float64.
states (Variable): A list of containing two tensors, each shaped
`[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}`
in the formula. The data type should be float32 or float64.
Returns:
tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \
a tensor with shape `[batch_size, hidden_size]`, corresponding \
to :math:`h_{t}` in the formula; `new_states` is a list containing \
two tenser variables shaped `[batch_size, hidden_size]`, corresponding \
to :math:`h_{t}, c_{t}` in the formula. The data type of these \
tensors all is same as that of `states`.
"""
pre_hidden, pre_cell = states
concat_input_hidden = layers.concat([inputs, pre_hidden], 1)
gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
gate_input = layers.elementwise_add(gate_input, self._bias)
i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
new_cell = layers.elementwise_add(
layers.elementwise_mul(
pre_cell,
self._gate_activation(
layers.elementwise_add(f, self._forget_bias))),
layers.elementwise_mul(
self._gate_activation(i), self._activation(j)))
new_hidden = self._activation(new_cell) * self._gate_activation(o)
return new_hidden, [new_hidden, new_cell]
@property
def state_shape(self):
"""
The `state_shape` of BasicLSTMCell is a list with two shapes: `[[hidden_size], [hidden_size]]`
(-1 for batch size would be automatically inserted into shape). These two
shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` separately.
"""
return [[self._hidden_size], [self._hidden_size]]
class BasicGRUCell(RNNCell):
"""
Gated Recurrent Unit (GRU) RNN cell.
The formula for GRU used is as follows:
.. math::
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
Please refer to `An Empirical Exploration of Recurrent Network Architectures
<http://proceedings.mlr.press/v37/jozefowicz15.pdf>`_ for more details.
Parameters:
input_size (int): The input size for the first GRU cell.
hidden_size (int): The hidden size for every GRU cell.
param_attr(ParamAttr, optional): The parameter attribute for the learnable
weight matrix. Default: None.
bias_attr (ParamAttr, optional): The parameter attribute for the bias
of LSTM. Default: None.
gate_activation (function, optional): The activation function for gates
of GRU, that is :math:`act_g` in the formula. Default: None,
representing for `fluid.layers.sigmoid`.
activation (function, optional): The non-gate activation function of
GRU, that is :math:`act_c` in the formula. Default: None,
representing for 'fluid.layers.tanh'.
dtype(string, optional): The data type used in this cell. Default float32.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import BasicGRUCell, RNN
inputs = paddle.rand((2, 4, 32))
cell = BasicGRUCell(input_size=32, hidden_size=64)
rnn = RNN(cell=cell)
outputs, _ = rnn(inputs) # [2, 4, 64]
"""
def __init__(self,
input_size,
hidden_size,
param_attr=None,
bias_attr=None,
gate_activation=None,
activation=None,
dtype='float32'):
super(BasicGRUCell, self).__init__()
self._input_size = input_size
self._hidden_size = hidden_size
self._param_attr = param_attr
self._bias_attr = bias_attr
self._gate_activation = gate_activation or layers.sigmoid
self._activation = activation or layers.tanh
self._dtype = dtype
if self._param_attr is not None and self._param_attr.name is not None:
gate_param_attr = copy.deepcopy(self._param_attr)
candidate_param_attr = copy.deepcopy(self._param_attr)
gate_param_attr.name += "_gate"
candidate_param_attr.name += "_candidate"
else:
gate_param_attr = self._param_attr
candidate_param_attr = self._param_attr
self._gate_weight = self.create_parameter(
attr=gate_param_attr,
shape=[
self._input_size + self._hidden_size, 2 * self._hidden_size
],
dtype=self._dtype)
self._candidate_weight = self.create_parameter(
attr=candidate_param_attr,
shape=[self._input_size + self._hidden_size, self._hidden_size],
dtype=self._dtype)
if self._bias_attr is not None and self._bias_attr.name is not None:
gate_bias_attr = copy.deepcopy(self._bias_attr)
candidate_bias_attr = copy.deepcopy(self._bias_attr)
gate_bias_attr.name += "_gate"
candidate_bias_attr.name += "_candidate"
else:
gate_bias_attr = self._bias_attr
candidate_bias_attr = self._bias_attr
self._gate_bias = self.create_parameter(
attr=gate_bias_attr,
shape=[2 * self._hidden_size],
dtype=self._dtype,
is_bias=True)
self._candidate_bias = self.create_parameter(
attr=candidate_bias_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
def forward(self, inputs, states):
"""
Performs single step GRU calculations.
Parameters:
inputs (Variable): A tensor with shape `[batch_size, input_size]`,
corresponding to :math:`x_t` in the formula. The data type
should be float32 or float64.
states (Variable): A tensor with shape `[batch_size, hidden_size]`.
corresponding to :math:`h_{t-1}` in the formula. The data type
should be float32 or float64.
Returns:
tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` and \
`new_states` is the same tensor shaped `[batch_size, hidden_size]`, \
corresponding to :math:`h_t` in the formula. The data type of the \
tensor is same as that of `states`.
"""
pre_hidden = states
concat_input_hidden = layers.concat([inputs, pre_hidden], axis=1)
gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)
gate_input = layers.elementwise_add(gate_input, self._gate_bias)
gate_input = self._gate_activation(gate_input)
r, u = layers.split(gate_input, num_or_sections=2, dim=1)
r_hidden = r * pre_hidden
candidate = layers.matmul(
layers.concat([inputs, r_hidden], 1), self._candidate_weight)
candidate = layers.elementwise_add(candidate, self._candidate_bias)
c = self._activation(candidate)
new_hidden = u * pre_hidden + (1 - u) * c
return new_hidden, new_hidden
@property
def state_shape(self):
"""
The `state_shape` of BasicGRUCell is a shape `[hidden_size]` (-1 for batch
size would be automatically inserted into shape). The shape corresponds
to :math:`h_{t-1}`.
"""
return [self._hidden_size]
class RNN(Layer):
"""
RNN creates a recurrent neural network specified by RNNCell `cell`, which
performs :code:`cell.forward()` repeatedly until reaches to the maximum
length of `inputs`.
Parameters:
cell(RNNCell): An instance of `RNNCell`.
is_reverse (bool, optional): Indicate whether to calculate in the reverse
order of input sequences. Default: `False`.
time_major (bool, optional): Indicate the data layout of Tensor included
in `input` and `output` tensors. If `False`, the data layout would
be batch major with shape `[batch_size, sequence_length, ...]`. If
`True`, the data layout would be time major with shape
`[sequence_length, batch_size, ...]`. Default: `False`.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import StackedLSTMCell, RNN
inputs = paddle.rand((2, 4, 32))
cell = StackedLSTMCell(input_size=32, hidden_size=64)
rnn = RNN(cell=cell)
outputs, _ = rnn(inputs) # [2, 4, 64]
"""
def __init__(self, cell, is_reverse=False, time_major=False):
super(RNN, self).__init__()
self.cell = cell
if not hasattr(self.cell, "call"):
self.cell.call = self.cell.forward
self.is_reverse = is_reverse
self.time_major = time_major
self.batch_index, self.time_step_index = (1, 0) if time_major else (0,
1)
def forward(self,
inputs,
initial_states=None,
sequence_length=None,
**kwargs):
"""
Performs :code:`cell.forward()` repeatedly until reaches to the maximum
length of `inputs`.
Parameters:
inputs (Variable): A (possibly nested structure of) tensor variable[s].
The shape of tensor should be `[batch_size, sequence_length, ...]`
for `time_major == False` or `[sequence_length, batch_size, ...]`
for `time_major == True`. It represents the inputs to be unrolled
in RNN.
initial_states (Variable, optional): A (possibly nested structure of)
tensor variable[s], representing the initial state for RNN.
If not provided, `cell.get_initial_states` would be used to produce
the initial state. Default None.
sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
It stores real length of each instance, thus enables users to extract
the last valid state when past a batch element's sequence length for
correctness. If not provided, the paddings would be treated same as
non-padding inputs. Default None.
**kwargs: Additional keyword arguments. Arguments passed to `cell.forward`.
Returns:
tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \
outputs and states, both are Tensor or nested structure of Tensor. \
`final_outputs` has the same structure and data types as \
the returned `outputs` of :code:`cell.forward` , and each Tenser in `final_outputs` \
stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \
for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \
`final_states` is the counterpart at last time step of initial states, \
thus has the same structure with it and has tensors with same shapes \
and data types.
"""
if fluid.in_dygraph_mode():
class ArrayWrapper(object):
def __init__(self, x):
self.array = [x]
def append(self, x):
self.array.append(x)
return self
def _maybe_copy(state, new_state, step_mask):
# TODO: use where_op
new_state = fluid.layers.elementwise_mul(
new_state, step_mask,
axis=0) - fluid.layers.elementwise_mul(
state, (step_mask - 1), axis=0)
return new_state
flat_inputs = flatten(inputs)
batch_size, time_steps = (
flat_inputs[0].shape[self.batch_index],
flat_inputs[0].shape[self.time_step_index])
if initial_states is None:
initial_states = self.cell.get_initial_states(
batch_ref=inputs, batch_dim_idx=self.batch_index)
if not self.time_major:
inputs = map_structure(
lambda x: fluid.layers.transpose(x, [1, 0] + list(
range(2, len(x.shape)))), inputs)
if sequence_length is not None:
mask = fluid.layers.sequence_mask(
sequence_length,
maxlen=time_steps,
dtype=flatten(initial_states)[0].dtype)
mask = fluid.layers.transpose(mask, [1, 0])
if self.is_reverse:
inputs = map_structure(
lambda x: fluid.layers.reverse(x, axis=[0]), inputs)
mask = fluid.layers.reverse(
mask, axis=[0]) if sequence_length is not None else None
states = initial_states
outputs = []
for i in range(time_steps):
step_inputs = map_structure(lambda x: x[i], inputs)
step_outputs, new_states = self.cell(step_inputs, states,
**kwargs)
if sequence_length is not None:
new_states = map_structure(
partial(
_maybe_copy, step_mask=mask[i]),
states,
new_states)
states = new_states
outputs = map_structure(
lambda x: ArrayWrapper(x),
step_outputs) if i == 0 else map_structure(
lambda x, x_array: x_array.append(x), step_outputs,
outputs)
final_outputs = map_structure(
lambda x: fluid.layers.stack(x.array, axis=self.time_step_index
), outputs)
if self.is_reverse:
final_outputs = map_structure(
lambda x: fluid.layers.reverse(x, axis=self.time_step_index
), final_outputs)
final_states = new_states
else:
final_outputs, final_states = fluid.layers.rnn(
self.cell,
inputs,
initial_states=initial_states,
sequence_length=sequence_length,
time_major=self.time_major,
is_reverse=self.is_reverse,
**kwargs)
return final_outputs, final_states
class StackedRNNCell(RNNCell):
"""
Wrapper allowing a stack of RNN cells to behave as a single cell. It is used
to implement stacked RNNs.
Parameters:
cells (list|tuple): List of RNN cell instances.
Examples:
.. code-block:: python
from paddle.incubate.hapi.text import BasicLSTMCell, StackedRNNCell
cells = [BasicLSTMCell(32, 32), BasicLSTMCell(32, 32)]
stack_rnn = StackedRNNCell(cells)
"""
def __init__(self, cells):
super(StackedRNNCell, self).__init__()
self.cells = []
for i, cell in enumerate(cells):
self.cells.append(self.add_sublayer("cell_%d" % i, cell))
def forward(self, inputs, states, **kwargs):
"""
Performs :code:`cell.forward` for all including cells sequentially.
Each cell's `inputs` is the `outputs` of the previous cell. And each
cell's `states` is the corresponding one in `states`.
Parameters:
inputs (Variable): The inputs for the first cell. Mostly it is a
float32 or float64 tensor with shape `[batch_size, input_size]`.
states (list): A list containing states for all cells orderly.
**kwargs: Additional keyword arguments, which passed to `cell.forward`
for all including cells.
Returns:
tuple: A tuple( :code:`(outputs, new_states)` ). `outputs` is the \
`outputs` of the last cell. `new_states` is a list composed \
of all cells' `new_states`, and its structure and data type is \
same as that of `states` argument.
"""
new_states = []
for cell, state in zip(self.cells, states):
outputs, new_state = cell(inputs, state, **kwargs)
inputs = outputs
new_states.append(new_state)
return outputs, new_states
@staticmethod
def stack_param_attr(param_attr, n):
"""
If `param_attr` is a list or tuple, convert every element in it to a
ParamAttr instance. Otherwise, repeat `param_attr` `n` times to
construct a list, and rename every one by appending a increasing index
suffix to avoid having same names when `param_attr` contains a name.
Parameters:
param_attr (list|tuple|ParamAttr): A list, tuple or something can be
converted to a ParamAttr instance by `ParamAttr._to_attr`.
n (int): The times to repeat to construct a list when `param_attr`
is not a list or tuple.
Returns:
list: A list composed of each including cell's `param_attr`.
"""
if isinstance(param_attr, (list, tuple)):
assert len(param_attr) == n, (
"length of param_attr should be %d when it is a list/tuple" % n)
param_attrs = [
fluid.ParamAttr._to_attr(attr) for attr in param_attr
]
else:
param_attrs = []
attr = fluid.ParamAttr._to_attr(param_attr)
for i in range(n):
attr_i = copy.deepcopy(attr)
if attr.name:
attr_i.name = attr_i.name + "_" + str(i)
param_attrs.append(attr_i)
return param_attrs
@property
def state_shape(self):
"""
The `state_shape` of StackedRNNCell is a list composed of each including
cell's `state_shape`.
Returns:
list: A list composed of each including cell's `state_shape`.
"""
return [cell.state_shape for cell in self.cells]
class StackedLSTMCell(RNNCell):
"""
Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used
to implement stacked LSTM.
The formula for LSTM used here is as follows:
.. math::
i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
h_{t} & = o_{t} act_c (c_{t})
Parameters:
input_size (int): The input size for the first LSTM cell.
hidden_size (int): The hidden size for every LSTM cell.
gate_activation (function, optional): The activation function for gates
of LSTM, that is :math:`act_g` in the formula. Default: None,
representing for `fluid.layers.sigmoid`.
activation (function, optional): The non-gate activation function of
LSTM, that is :math:`act_c` in the formula. Default: None,
representing for 'fluid.layers.tanh'.
forget_bias (float, optional): forget bias used when computing forget
gate. It also can accept a boolean value `True`, which would set
:math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and
:math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in
http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf .
Default 1.0.
num_layers(int, optional): The number of LSTM to be stacked. Default 1.
dropout(float|list|tuple, optional): The dropout probability after each
LSTM. It also can be a list or tuple, including dropout probabilities
for the corresponding LSTM. Default 0.0
param_attr (list|tuple|ParamAttr): A list, tuple or something can be
converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
a list or tuple, it's length must equal to `num_layers`. Otherwise,
construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
Default None.
bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
a list or tuple, it's length must equal to `num_layers`. Otherwise,
construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
Default None.
dtype(string, optional): The data type used in this cell. It can be
float32 or float64. Default float32.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import StackedLSTMCell, RNN
inputs = paddle.rand((2, 4, 32))
cell = StackedLSTMCell(input_size=32, hidden_size=64)
rnn = RNN(cell=cell)
outputs, _ = rnn(inputs) # [2, 4, 64]
"""
def __init__(self,
input_size,
hidden_size,
gate_activation=None,
activation=None,
forget_bias=1.0,
num_layers=1,
dropout=0.0,
param_attr=None,
bias_attr=None,
dtype="float32"):
super(StackedLSTMCell, self).__init__()
self.dropout = utils.convert_to_list(dropout, num_layers, "dropout",
float)
param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers)
bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers)
self.cells = []
for i in range(num_layers):
if forget_bias is True:
bias_attrs[
i].initializer = fluid.initializer.NumpyArrayInitializer(
np.concatenate(
np.zeros(2 * hidden_size),
np.ones(hidden_size), np.zeros(hidden_size)).astype(
dtype))
forget_bias = 0.0
self.cells.append(
self.add_sublayer(
"lstm_%d" % i,
BasicLSTMCell(
input_size=input_size if i == 0 else hidden_size,
hidden_size=hidden_size,
gate_activation=gate_activation,
activation=activation,
forget_bias=forget_bias,
param_attr=param_attrs[i],
bias_attr=bias_attrs[i],
dtype=dtype)))
def forward(self, inputs, states):
"""
Performs the stacked LSTM cells sequentially. Each cell's `inputs` is
the `outputs` of the previous cell. And each cell's `states` is the
corresponding one in `states`.
Parameters:
inputs (Variable): The inputs for the first cell. It is a float32 or
float64 tensor with shape `[batch_size, input_size]`.
states (list): A list containing states for all cells orderly.
**kwargs: Additional keyword arguments, which passed to `cell.forward`
for all including cells.
Returns:
tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \
a tensor with shape `[batch_size, hidden_size]`, corresponding \
to :math:`h_{t}` in the formula of the last LSTM; `new_states` \
is a list composed of every LSTM `new_states` which is a pair \
of tensors standing for :math:`h_{t}, c_{t}` in the formula, \
and the data type and structure of these tensors all is same \
as that of `states`.
"""
new_states = []
for i, cell in enumerate(self.cells):
outputs, new_state = cell(inputs, states[i])
outputs = layers.dropout(
outputs,
self.dropout[i],
dropout_implementation='upscale_in_train') if self.dropout[
i] > 0 else outputs
inputs = outputs
new_states.append(new_state)
return outputs, new_states
@property
def state_shape(self):
"""
The `state_shape` of StackedLSTMCell is a list composed of each including
LSTM cell's `state_shape`.
Returns:
list: A list composed of each including LSTM cell's `state_shape`.
"""
return [cell.state_shape for cell in self.cells]
class LSTM(Layer):
"""
Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input
sequence.
The formula for LSTM used here is as follows:
.. math::
i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
h_{t} & = o_{t} act_c (c_{t})
Parameters:
input_size (int): The input feature size for the first LSTM.
hidden_size (int): The hidden size for every LSTM.
gate_activation (function, optional): The activation function for gates
of LSTM, that is :math:`act_g` in the formula. Default: None,
representing for `fluid.layers.sigmoid`.
activation (function, optional): The non-gate activation function of
LSTM, that is :math:`act_c` in the formula. Default: None,
representing for 'fluid.layers.tanh'.
forget_bias (float, optional): forget bias used when computing forget
gate. It also can accept a boolean value `True`, which would set
:math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and
:math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in
http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf .
Default 1.0.
num_layers(int, optional): The number of LSTM to be stacked. Default 1.
dropout(float|list|tuple, optional): The dropout probability after each
LSTM. It also can be a list or tuple, including dropout probabilities
for the corresponding LSTM. Default 0.0
is_reverse (bool, optional): Indicate whether to calculate in the reverse
order of input sequences. Default: `False`.
time_major (bool, optional): Indicate the data layout of Tensor included
in `input` and `output` tensors. If `False`, the data layout would
be batch major with shape `[batch_size, sequence_length, ...]`. If
`True`, the data layout would be time major with shape
`[sequence_length, batch_size, ...]`. Default: `False`.
param_attr (list|tuple|ParamAttr): A list, tuple or something can be
converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
a list or tuple, it's length must equal to `num_layers`. Otherwise,
construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
Default None.
bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
a list or tuple, it's length must equal to `num_layers`. Otherwise,
construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
Default None.
dtype(string, optional): The data type used in this cell. It can be
float32 or float64. Default float32.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import LSTM
inputs = paddle.rand((2, 4, 32))
lstm = LSTM(input_size=32, hidden_size=64, num_layers=2)
outputs, _ = lstm(inputs) # [2, 4, 64]
"""
def __init__(self,
input_size,
hidden_size,
gate_activation=None,
activation=None,
forget_bias=1.0,
num_layers=1,
dropout=0.0,
is_reverse=False,
time_major=False,
param_attr=None,
bias_attr=None,
dtype='float32'):
super(LSTM, self).__init__()
lstm_cell = StackedLSTMCell(input_size, hidden_size, gate_activation,
activation, forget_bias, num_layers,
dropout, param_attr, bias_attr, dtype)
self.lstm = RNN(lstm_cell, is_reverse, time_major)
def forward(self, inputs, initial_states=None, sequence_length=None):
"""
Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs`
is the `inputs` of the subsequent one.
Parameters:
inputs (Variable): The inputs for the first LSTM. It is a float32
or float64 tensor shaped `[batch_size, sequence_length, input_size]`.
initial_states (list|None, optional): A list containing initial states
of all stacked LSTM, and the initial states of each LSTM is a pair
of tensors shaped `[batch_size, hidden_size]`. If not provided,
use 0 as initial states. Default None.
sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
It stores real length of each instance, thus enables users to extract
the last valid state when past a batch element's sequence length for
correctness. If not provided, the paddings would be treated same as
non-padding inputs. Default None.
Returns:
tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
is the output of last LSTM and it is a tensor with shape \
`[batch_size, sequence_length, hidden_size]` and has the same \
data type as `inputs`, `final_states` is the counterpart of \
`initial_states` at last time step, thus has the same structure \
with it and has tensors with same shapes data types.
"""
return self.lstm(inputs, initial_states, sequence_length)
class BidirectionalRNN(Layer):
"""
Wrapper for bidirectional RNN. It assembles two RNNCell instances to perform
forward and backward RNN separately, and merge outputs of these two RNN
according to `merge_mode`.
Parameters:
cell_fw (RNNCell): A RNNCell instance used for forward RNN.
cell_bw (RNNCell): A RNNCell instance used for backward RNN.
merge_mode (str|None, optional): The way to merget outputs of forward and
backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None,
where None stands for make the two `outputs` as a tuple, `zip` stands
for make each two corresponding tensors of the two `outputs` as a tuple.
Default `concat`
Examples:
.. code-block:: python
import paddle
from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN
inputs = paddle.rand((2, 4, 32))
cell_fw = StackedLSTMCell(32, 64)
cell_bw = StackedLSTMCell(32, 64)
bi_rnn = BidirectionalRNN(cell_fw, cell_bw)
outputs, _ = bi_rnn(inputs) # [2, 4, 128]
"""
def __init__(self,
cell_fw,
cell_bw,
merge_mode='concat',
time_major=False,
cell_cls=None,
**kwargs):
super(BidirectionalRNN, self).__init__()
self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major)
self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major)
if merge_mode == 'concat':
self.merge_func = lambda x, y: layers.concat([x, y], -1)
elif merge_mode == 'sum':
self.merge_func = lambda x, y: layers.elementwise_add(x, y)
elif merge_mode == 'ave':
self.merge_func = lambda x, y: layers.scale(
layers.elementwise_add(x, y), 0.5)
elif merge_mode == 'mul':
self.merge_func = lambda x, y: layers.elementwise_mul(x, y)
elif merge_mode == 'zip':
self.merge_func = lambda x, y: (x, y)
elif merge_mode is None:
self.merge_func = None
else:
raise ValueError('Unsupported value for `merge_mode`: %s' %
merge_mode)
def forward(self,
inputs,
initial_states=None,
sequence_length=None,
**kwargs):
"""
Performs forward and backward RNN separately, and merge outputs of these
two RNN according to `merge_mode`.
Parameters:
inputs (Variable): A (possibly nested structure of) tensor variable[s].
The shape of tensor should be `[batch_size, sequence_length, ...]`
for `time_major == False` or `[sequence_length, batch_size, ...]`
for `time_major == True`. It represents the inputs to be unrolled
in both forward and backward RNN.
initial_states (Variable|list|tuple): If it is a list or tuple, its
length should be 2 to include initial states of forward and backward
RNN separately. Otherwise it would be used twice for the two RNN.
If None, `cell.get_initial_states` would be used to produce the initial
states. Default None.
sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
It stores real length of each instance, thus enables users to extract
the last valid state when past a batch element's sequence length for
correctness. If not provided, the paddings would be treated same as
non-padding inputs. Default None.
**kwargs: Additional keyword arguments. Arguments passed to `cell.forward`.
Returns:
tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
is produced by merge outputs of forward and backward RNN according \
to `merge_mode`, `final_states` is a pair including `final_states` \
of forward and backward RNN.
"""
if isinstance(initial_states, (list, tuple)):
assert len(
initial_states
) == 2, "length of initial_states should be 2 when it is a list/tuple"
else:
initial_states = [initial_states, initial_states]
outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0],
sequence_length, **kwargs)
outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1],
sequence_length, **kwargs)
outputs = map_structure(self.merge_func, outputs_fw,
outputs_bw) if self.merge_func else (outputs_fw,
outputs_bw)
return outputs, (states_fw, states_bw)
@staticmethod
def bidirect_param_attr(param_attr):
"""
Converts `param_attr` to a pair of `param_attr` when it is not a list
or tuple with length 2, also rename every one by appending a suffix to
avoid having same names when `param_attr` contains a name.
Parameters:
param_attr (list|tuple|ParamAttr): A list, tuple or something can be
converted to a ParamAttr instance by `ParamAttr._to_attr`. When
it is a list or tuple, its length must be 2.
Returns:
list: A pair composed of forward and backward RNN cell's `param_attr`.
"""
if isinstance(param_attr, (list, tuple)):
assert len(
param_attr
) == 2, "length of param_attr should be 2 when it is a list/tuple"
param_attrs = param_attr
else:
param_attrs = []
attr = fluid.ParamAttr._to_attr(param_attr)
attr_fw = copy.deepcopy(attr)
if attr.name:
attr_fw.name = attr_fw.name + "_fw"
param_attrs.append(attr_fw)
attr_bw = copy.deepcopy(attr)
if attr.name:
attr_bw.name = attr_bw.name + "_bw"
param_attrs.append(attr_bw)
return param_attrs
class BidirectionalLSTM(Layer):
"""
Applies a bidirectional multi-layer long short-term memory (LSTM) RNN to an
input sequence.
Bidirection interaction can happen after each layer or only after the last
layer according to the `merge_each_layer` setting. The way to interact,
that is how to merge outputs of the two direction, is determined by `merge_mode`.
The formula for LSTM used here is as follows:
.. math::
i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
h_{t} & = o_{t} act_c (c_{t})
Parameters:
input_size (int): The input feature size for the first LSTM.
hidden_size (int): The hidden size for every LSTM.
gate_activation (function, optional): The activation function for gates
of LSTM, that is :math:`act_g` in the formula. Default: None,
representing for `fluid.layers.sigmoid`.
activation (function, optional): The non-gate activation function of
LSTM, that is :math:`act_c` in the formula. Default: None,
representing for 'fluid.layers.tanh'.
forget_bias (float, optional): forget bias used when computing forget
gate. It also can accept a boolean value `True`, which would set
:math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and
:math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in
http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf .
Default 1.0.
num_layers(int, optional): The number of LSTM to be stacked. Default 1.
dropout(float|list|tuple, optional): The dropout probability after each
LSTM. It also can be a list or tuple, including dropout probabilities
for the corresponding LSTM. Default 0.0
merge_mode (str|None, optional): The way to merget outputs of forward and
backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None,
where None stands for make the two `outputs` as a tuple, `zip` stands
for make each two corresponding tensors of the two `outputs` as a tuple.
Default `concat`
merge_each_layer (bool, optional): Indicate whether bidirection interaction
happens after each layer or only after the last layer. Default: `False`.
time_major (bool, optional): Indicate the data layout of Tensor included
in `input` and `output` tensors. If `False`, the data layout would
be batch major with shape `[batch_size, sequence_length, ...]`. If
`True`, the data layout would be time major with shape
`[sequence_length, batch_size, ...]`. Default: `False`.
param_attr (list|tuple|ParamAttr): A list, tuple or something can be
converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
a list or tuple, it's length must equal to `num_layers`. Otherwise,
construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
Default None.
bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
a list or tuple, it's length must equal to `num_layers`. Otherwise,
construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
Default None.
dtype(string, optional): The data type used in this cell. It can be
float32 or float64. Default float32.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import BidirectionalLSTM
inputs = paddle.rand((2, 4, 32))
bi_lstm = BidirectionalLSTM(input_size=32, hidden_size=64, num_layers=2)
outputs, _ = bi_lstm(inputs) # [2, 4, 128]
"""
def __init__(self,
input_size,
hidden_size,
gate_activation=None,
activation=None,
forget_bias=1.0,
num_layers=1,
dropout=0.0,
merge_mode='concat',
merge_each_layer=False,
time_major=False,
param_attr=None,
bias_attr=None,
dtype='float32'):
super(BidirectionalLSTM, self).__init__()
self.num_layers = num_layers
self.merge_mode = merge_mode
self.merge_each_layer = merge_each_layer
param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr)
bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr)
if not merge_each_layer:
cell_fw = StackedLSTMCell(input_size, hidden_size, gate_activation,
activation, forget_bias, num_layers,
dropout, param_attrs[0], bias_attrs[0],
dtype)
cell_bw = StackedLSTMCell(input_size, hidden_size, gate_activation,
activation, forget_bias, num_layers,
dropout, param_attrs[1], bias_attrs[1],
dtype)
self.lstm = BidirectionalRNN(
cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major)
else:
fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0],
num_layers)
bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1],
num_layers)
fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0],
num_layers)
bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1],
num_layers)
# maybe design cell including both forward and backward later
self.lstm = []
for i in range(num_layers):
cell_fw = StackedLSTMCell(
input_size
if i == 0 else (hidden_size * 2
if merge_mode == 'concat' else hidden_size),
hidden_size, gate_activation, activation, forget_bias, 1,
dropout, fw_param_attrs[i], fw_bias_attrs[i], dtype)
cell_bw = StackedLSTMCell(
input_size
if i == 0 else (hidden_size * 2
if merge_mode == 'concat' else hidden_size),
hidden_size, gate_activation, activation, forget_bias, 1,
dropout, bw_param_attrs[i], bw_bias_attrs[i], dtype)
self.lstm.append(
self.add_sublayer(
"lstm_%d" % i,
BidirectionalRNN(
cell_fw,
cell_bw,
merge_mode=merge_mode,
time_major=time_major)))
def forward(self, inputs, initial_states=None, sequence_length=None):
"""
Performs bidirectional multi-layer LSTM layer by layer. Each LSTM's `outputs`
is the `inputs` of the subsequent one, or when `merge_each_layer` is True,
merged outputs would be the `inputs` of the subsequent one.
Parameters:
inputs (Variable): The inputs for the first LSTM. It is a float32
or float64 tensor shaped `[batch_size, sequence_length, input_size]`.
initial_states (list|None, optional): A list containing initial states
of all stacked LSTM. If `merge_each_layer` is True, the length of
list should be `num_layers` and a single value would be reused for
`num_layers`; Otherwise, the length should be 2 and a single value
would be reused twice. If not provided, use 0 as initial states.
Default None.
sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
It stores real length of each instance, thus enables users to extract
the last valid state when past a batch element's sequence length for
correctness. If not provided, the paddings would be treated same as
non-padding inputs. Default None.
Returns:
tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
is the output of last bidirectional LSTM; `final_states` is a \
pair including `final_states` of forward and backward LSTM when \
`merge_each_layer` is False or a list including `final_states` \
of all stacked bidirectional LSTM, and it has tensors with same \
shapes data types as `initial_states`.
"""
if not self.merge_each_layer:
return self.lstm(inputs, initial_states, sequence_length)
else:
if isinstance(initial_states, (list, tuple)):
assert len(initial_states) == self.num_layers, (
"length of initial_states should be %d when it is a list/tuple"
% self.num_layers)
else:
initial_states = [initial_states] * self.num_layers
stacked_states = []
for i in range(self.num_layers):
outputs, states = self.lstm[i](inputs, initial_states[i],
sequence_length)
inputs = outputs
stacked_states.append(states)
return outputs, stacked_states
class StackedGRUCell(RNNCell):
"""
Wrapper allowing a stack of GRU cells to behave as a single cell. It is used
to implement stacked GRU.
The formula for GRU used here is as follows:
.. math::
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
Parameters:
input_size (int): The input size for the first GRU cell.
hidden_size (int): The hidden size for every GRU cell.
gate_activation (function, optional): The activation function for gates
of GRU, that is :math:`act_g` in the formula. Default: None,
representing for `fluid.layers.sigmoid`.
activation (function, optional): The non-gate activation function of
GRU, that is :math:`act_c` in the formula. Default: None,
representing for 'fluid.layers.tanh'.
num_layers(int, optional): The number of LSTM to be stacked. Default 1.
dropout(float|list|tuple, optional): The dropout probability after each
GRU. It also can be a list or tuple, including dropout probabilities
for the corresponding GRU. Default 0.0
param_attr (list|tuple|ParamAttr): A list, tuple or something can be
converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
a list or tuple, it's length must equal to `num_layers`. Otherwise,
construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
Default None.
bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
a list or tuple, it's length must equal to `num_layers`. Otherwise,
construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
Default None.
dtype(string, optional): The data type used in this cell. It can be
float32 or float64. Default float32.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import StackedGRUCell, RNN
inputs = paddle.rand((2, 4, 32))
cell = StackedGRUCell(input_size=32, hidden_size=64)
rnn = RNN(cell=cell)
outputs, _ = rnn(inputs) # [2, 4, 64]
"""
def __init__(self,
input_size,
hidden_size,
gate_activation=None,
activation=None,
num_layers=1,
dropout=0.0,
param_attr=None,
bias_attr=None,
dtype="float32"):
super(StackedGRUCell, self).__init__()
self.dropout = utils.convert_to_list(dropout, num_layers, "dropout",
float)
param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers)
bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers)
self.cells = []
for i in range(num_layers):
self.cells.append(
self.add_sublayer(
"gru_%d" % i,
BasicGRUCell(
input_size=input_size if i == 0 else hidden_size,
hidden_size=hidden_size,
gate_activation=gate_activation,
activation=activation,
param_attr=param_attrs[i],
bias_attr=bias_attrs[i],
dtype=dtype)))
def forward(self, inputs, states):
"""
Performs the stacked GRU cells sequentially. Each cell's `inputs` is
the `outputs` of the previous cell. And each cell's `states` is the
corresponding one in `states`.
Parameters:
inputs (Variable): The inputs for the first cell. It is a float32 or
float64 tensor with shape `[batch_size, input_size]`.
states (list): A list containing states for all cells orderly.
**kwargs: Additional keyword arguments, which passed to `cell.forward`
for all including cells.
Returns:
tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \
a tensor with shape `[batch_size, hidden_size]`, corresponding \
to :math:`h_{t}` in the formula of the last GRU; `new_states` \
is a list composed of every GRU `new_states` which is also \
:math:`h_{t}` in the formula, and the data type and structure \
of these tensors all is same as that of `states`.
"""
new_states = []
for i, cell in enumerate(self.cells):
outputs, new_state = cell(inputs, states[i])
outputs = layers.dropout(
outputs,
self.dropout[i],
dropout_implementation='upscale_in_train') if self.dropout[
i] > 0 else outputs
inputs = outputs
new_states.append(new_state)
return outputs, new_states
@property
def state_shape(self):
"""
The `state_shape` of StackedGRUCell is a list composed of each including
GRU cell's `state_shape`.
Returns:
list: A list composed of each including GRU cell's `state_shape`.
"""
return [cell.state_shape for cell in self.cells]
class GRU(Layer):
"""
Applies a stacked multi-layer gated recurrent unit (GRU) RNN to an input
sequence.
The formula for GRU used here is as follows:
.. math::
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
Parameters:
input_size (int): The input feature size for the first GRU cell.
hidden_size (int): The hidden size for every GRU cell.
gate_activation (function, optional): The activation function for gates
of GRU, that is :math:`act_g` in the formula. Default: None,
representing for `fluid.layers.sigmoid`.
activation (function, optional): The non-gate activation function of
GRU, that is :math:`act_c` in the formula. Default: None,
representing for 'fluid.layers.tanh'.
num_layers(int, optional): The number of GRU to be stacked. Default 1.
dropout(float|list|tuple, optional): The dropout probability after each
GRU. It also can be a list or tuple, including dropout probabilities
for the corresponding GRU. Default 0.0
is_reverse (bool, optional): Indicate whether to calculate in the reverse
order of input sequences. Default: `False`.
time_major (bool, optional): Indicate the data layout of Tensor included
in `input` and `output` tensors. If `False`, the data layout would
be batch major with shape `[batch_size, sequence_length, ...]`. If
`True`, the data layout would be time major with shape
`[sequence_length, batch_size, ...]`. Default: `False`.
param_attr (list|tuple|ParamAttr): A list, tuple or something can be
converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
a list or tuple, it's length must equal to `num_layers`. Otherwise,
construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
Default None.
bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
a list or tuple, it's length must equal to `num_layers`. Otherwise,
construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
Default None.
dtype(string, optional): The data type used in this cell. It can be
float32 or float64. Default float32.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import GRU
inputs = paddle.rand((2, 4, 32))
gru = GRU(input_size=32, hidden_size=64, num_layers=2)
outputs, _ = gru(inputs) # [2, 4, 64]
"""
def __init__(self,
input_size,
hidden_size,
gate_activation=None,
activation=None,
num_layers=1,
dropout=0.0,
is_reverse=False,
time_major=False,
param_attr=None,
bias_attr=None,
dtype='float32'):
super(GRU, self).__init__()
gru_cell = StackedGRUCell(input_size, hidden_size, gate_activation,
activation, num_layers, dropout, param_attr,
bias_attr, dtype)
self.gru = RNN(gru_cell, is_reverse, time_major)
def forward(self, inputs, initial_states=None, sequence_length=None):
"""
Performs the stacked multi-layer GRU layer by layer. Each GRU's `outputs`
is the `inputs` of the subsequent one.
Parameters:
inputs (Variable): The inputs for the first GRU. It is a float32
or float64 tensor shaped `[batch_size, sequence_length, input_size]`.
initial_states (list|None, optional): A list containing initial states
of all stacked GRU, and the initial states of each GRU is a tensor
shaped `[batch_size, hidden_size]`. If not provided, use 0 as initial
states. Default None.
sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
It stores real length of each instance, thus enables users to extract
the last valid state when past a batch element's sequence length for
correctness. If not provided, the paddings would be treated same as
non-padding inputs. Default None.
Returns:
tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
is the output of last GRU and it is a tensor with shape \
`[batch_size, sequence_length, hidden_size]` and has the same \
data type as `inputs`, `final_states` is the counterpart of \
`initial_states` at last time step, thus has the same structure \
with it and has tensors with same shapes data types.
"""
return self.gru(inputs, initial_states, sequence_length)
class BidirectionalGRU(Layer):
"""
Applies a bidirectional multi-layer gated recurrent unit (GRU) RNN to an input
sequence.
Bidirection interaction can happen after each layer or only after the last
layer according to the `merge_each_layer` setting. The way to interact,
that is how to merge outputs of the two direction, is determined by `merge_mode`.
The formula for GRU used here is as follows:
.. math::
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
Parameters:
input_size (int): The input feature size for the first GRU cell.
hidden_size (int): The hidden size for every GRU cell.
gate_activation (function, optional): The activation function for gates
of GRU, that is :math:`act_g` in the formula. Default: None,
representing for `fluid.layers.sigmoid`.
activation (function, optional): The non-gate activation function of
GRU, that is :math:`act_c` in the formula. Default: None,
representing for 'fluid.layers.tanh'.
num_layers(int, optional): The number of GRU to be stacked. Default 1.
dropout(float|list|tuple, optional): The dropout probability after each
GRU. It also can be a list or tuple, including dropout probabilities
for the corresponding GRU. Default 0.0
merge_mode (str|None, optional): The way to merget outputs of forward and
backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None,
where None stands for make the two `outputs` as a tuple, `zip` stands
for make each two corresponding tensors of the two `outputs` as a tuple.
Default `concat`
merge_each_layer (bool, optional): Indicate whether bidirection interaction
happens after each layer or only after the last layer. Default: `False`.
time_major (bool, optional): Indicate the data layout of Tensor included
in `input` and `output` tensors. If `False`, the data layout would
be batch major with shape `[batch_size, sequence_length, ...]`. If
`True`, the data layout would be time major with shape
`[sequence_length, batch_size, ...]`. Default: `False`.
param_attr (list|tuple|ParamAttr): A list, tuple or something can be
converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
a list or tuple, it's length must equal to `num_layers`. Otherwise,
construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
Default None.
bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
a list or tuple, it's length must equal to `num_layers`. Otherwise,
construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
Default None.
dtype(string, optional): The data type used in this cell. It can be
float32 or float64. Default float32.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import BidirectionalGRU
inputs = paddle.rand((2, 4, 32))
bi_gru = BidirectionalGRU(input_size=32, hidden_size=64, num_layers=2)
outputs, _ = bi_gru(inputs) # [2, 4, 128]
"""
def __init__(self,
input_size,
hidden_size,
gate_activation=None,
activation=None,
forget_bias=1.0,
num_layers=1,
dropout=0.0,
merge_mode='concat',
merge_each_layer=False,
time_major=False,
param_attr=None,
bias_attr=None,
dtype='float32'):
super(BidirectionalGRU, self).__init__()
self.num_layers = num_layers
self.merge_mode = merge_mode
self.merge_each_layer = merge_each_layer
param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr)
bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr)
if not merge_each_layer:
cell_fw = StackedGRUCell(input_size, hidden_size, gate_activation,
activation, num_layers, dropout,
param_attrs[0], bias_attrs[0], dtype)
cell_bw = StackedGRUCell(input_size, hidden_size, gate_activation,
activation, num_layers, dropout,
param_attrs[1], bias_attrs[1], dtype)
self.gru = BidirectionalRNN(
cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major)
else:
fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0],
num_layers)
bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1],
num_layers)
fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0],
num_layers)
bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1],
num_layers)
# maybe design cell including both forward and backward later
self.gru = []
for i in range(num_layers):
cell_fw = StackedGRUCell(input_size if i == 0 else (
hidden_size * 2 if merge_mode == 'concat' else
hidden_size), hidden_size, gate_activation, activation, 1,
dropout, fw_param_attrs[i],
fw_bias_attrs[i], dtype)
cell_bw = StackedGRUCell(input_size if i == 0 else (
hidden_size * 2 if merge_mode == 'concat' else
hidden_size), hidden_size, gate_activation, activation, 1,
dropout, bw_param_attrs[i],
bw_bias_attrs[i], dtype)
self.gru.append(
self.add_sublayer(
"gru_%d" % i,
BidirectionalRNN(
cell_fw,
cell_bw,
merge_mode=merge_mode,
time_major=time_major)))
def forward(self, inputs, initial_states=None, sequence_length=None):
"""
Performs bidirectional multi-layer GRU layer by layer. Each GRU's `outputs`
is the `inputs` of the subsequent one, or when `merge_each_layer` is True,
merged outputs would be the `inputs` of the subsequent one.
Parameters:
inputs (Variable): The inputs for the first GRU. It is a float32
or float64 tensor shaped `[batch_size, sequence_length, input_size]`.
initial_states (list|None, optional): A list containing initial states
of all stacked GRU. If `merge_each_layer` is True, the length of
list should be `num_layers` and a single value would be reused for
`num_layers`; Otherwise, the length should be 2 and a single value
would be reused twice. If not provided, use 0 as initial states.
Default None.
sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
It stores real length of each instance, thus enables users to extract
the last valid state when past a batch element's sequence length for
correctness. If not provided, the paddings would be treated same as
non-padding inputs. Default None.
Returns:
tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
is the output of last bidirectional GRU; `final_states` is a \
pair including `final_states` of forward and backward GRU when \
`merge_each_layer` is False or a list including `final_states` \
of all stacked bidirectional GRU, and it has tensors with same \
shapes data types as `initial_states`.
"""
if not self.merge_each_layer:
return self.gru(inputs, initial_states, sequence_length)
else:
if isinstance(initial_states, (list, tuple)):
assert len(initial_states) == self.num_layers, (
"length of initial_states should be %d when it is a list/tuple"
% self.num_layers)
else:
initial_states = [initial_states] * self.num_layers
stacked_states = []
for i in range(self.num_layers):
outputs, states = self.gru[i](inputs, initial_states[i],
sequence_length)
inputs = outputs
stacked_states.append(states)
return outputs, stacked_states
class DynamicDecode(Layer):
"""
DynamicDecode integrates an Decoder instance to perform dynamic decoding.
It performs :code:`decoder.step()` repeatedly until the returned Tensor
indicating finished status contains all True values or the number of
decoding step reaches to :attr:`max_step_num`.
:code:`decoder.initialize()` would be called once before the decoding loop.
If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()`
would be called once after the decoding loop.
Parameters:
decoder (Decoder): An instance of `Decoder`.
max_step_num (int, optional): The maximum number of steps. If not provided,
decode until the decoder is fully done, or in other words, the returned
Tensor by :code:`decoder.step()` indicating finished status contains
all True. Default `None`.
output_time_major (bool, optional): Indicate the data layout of Tensor included
in the final outputs(the first returned value of this method). If
attr:`False`, the data layout would be batch major with shape
`[batch_size, seq_len, ...]`. If attr:`True`, the data layout would
be time major with shape `[seq_len, batch_size, ...]`. Default: `False`.
impute_finished (bool, optional): If `True`, then states get copied through
for batch entries which are marked as finished, which differs with the
unfinished using the new states returned by :code:`decoder.step()` and
ensures that the final states have the correct values. Otherwise, states
wouldn't be copied through when finished. If the returned `final_states`
is needed, it should be set as True, which causes some slowdown.
Default `False`.
is_test (bool, optional): A flag indicating whether to use test mode. In
test mode, it is more memory saving. Default `False`.
return_length (bool, optional): A flag indicating whether to return an
extra Tensor variable in the output tuple, which stores the actual
lengths of all decoded sequences. Default `False`.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.fluid.layers import BeamSearchDecoder
from paddle.incubate.hapi.text import StackedLSTMCell, DynamicDecode
paddle.enable_dygraph()
vocab_size, d_model, = 100, 32
encoder_output = paddle.rand((2, 4, d_model))
trg_embeder = fluid.dygraph.Embedding(size=[vocab_size, d_model])
output_layer = fluid.dygraph.Linear(d_model, vocab_size)
cell = StackedLSTMCell(input_size=d_model, hidden_size=d_model)
decoder = BeamSearchDecoder(cell,
start_token=0,
end_token=1,
beam_size=4,
embedding_fn=trg_embeder,
output_fn=output_layer)
dynamic_decoder = DynamicDecode(decoder, max_step_num=10)
outputs = dynamic_decoder(cell.get_initial_states(encoder_output))
"""
def __init__(self,
decoder,
max_step_num=None,
output_time_major=False,
impute_finished=False,
is_test=False,
return_length=False):
super(DynamicDecode, self).__init__()
self.decoder = decoder
self.max_step_num = max_step_num
self.output_time_major = output_time_major
self.impute_finished = impute_finished
self.is_test = is_test
self.return_length = return_length
def forward(self, inits=None, **kwargs):
"""
Performs :code:`decoder.step()` repeatedly until the returned Tensor
indicating finished status contains all True values or the number of
decoding step reaches to :attr:`max_step_num`.
:code:`decoder.initialize()` would be called once before the decoding loop.
If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()`
would be called once after the decoding loop.
Parameters:
inits (object, optional): Argument passed to `decoder.initialize`.
Default `None`.
**kwargs: Additional keyword arguments. Arguments passed to `decoder.step`.
Returns:
tuple: A tuple( :code:`(final_outputs, final_states, sequence_lengths)` ) \
when `return_length` is True, otherwise a tuple( :code:`(final_outputs, final_states)` ). \
The final outputs and states, both are Tensor or nested structure of Tensor. \
`final_outputs` has the same structure and data types as the :code:`outputs` \
returned by :code:`decoder.step()` , and each Tenser in `final_outputs` \
is the stacked of all decoding steps' outputs, which might be revised \
by :code:`decoder.finalize()` if the decoder has implemented `finalize`. \
`final_states` is the counterpart at last time step of initial states \
returned by :code:`decoder.initialize()` , thus has the same structure \
with it and has tensors with same shapes and data types. `sequence_lengths` \
is an `int64` tensor with the same shape as `finished` returned \
by :code:`decoder.initialize()` , and it stores the actual lengths of \
all decoded sequences.
"""
if fluid.in_dygraph_mode():
class ArrayWrapper(object):
def __init__(self, x):
self.array = [x]
def append(self, x):
self.array.append(x)
return self
def __getitem__(self, item):
return self.array.__getitem__(item)
def _maybe_copy(state, new_state, step_mask):
# TODO: use where_op
state_dtype = state.dtype
if convert_dtype(state_dtype) in ["bool"]:
state = layers.cast(state, dtype="float32")
new_state = layers.cast(new_state, dtype="float32")
if step_mask.dtype != state.dtype:
step_mask = layers.cast(step_mask, dtype=state.dtype)
# otherwise, renamed bool gradients of would be summed up leading
# to sum(bool) error.
step_mask.stop_gradient = True
new_state = layers.elementwise_mul(
state, step_mask, axis=0) - layers.elementwise_mul(
new_state, (step_mask - 1), axis=0)
if convert_dtype(state_dtype) in ["bool"]:
new_state = layers.cast(new_state, dtype=state_dtype)
return new_state
initial_inputs, initial_states, initial_finished = self.decoder.initialize(
inits)
inputs, states, finished = (initial_inputs, initial_states,
initial_finished)
cond = layers.logical_not((layers.reduce_all(initial_finished)))
sequence_lengths = layers.cast(
layers.zeros_like(initial_finished), "int64")
outputs = None
step_idx = 0
step_idx_tensor = layers.fill_constant(
shape=[1], dtype="int64", value=step_idx)
while cond.numpy():
(step_outputs, next_states, next_inputs,
next_finished) = self.decoder.step(step_idx_tensor, inputs,
states, **kwargs)
if not self.decoder.tracks_own_finished:
# BeamSearchDecoder would track it own finished, since
# beams would be reordered and the finished status of each
# entry might change. Otherwise, perform logical OR which
# would not change the already finished.
next_finished = layers.logical_or(next_finished, finished)
# To confirm states.finished/finished be consistent with
# next_finished.
layers.assign(next_finished, finished)
next_sequence_lengths = layers.elementwise_add(
sequence_lengths,
layers.cast(
layers.logical_not(finished), sequence_lengths.dtype))
if self.impute_finished: # rectify the states for the finished.
next_states = map_structure(
lambda x, y: _maybe_copy(x, y, finished), states,
next_states)
outputs = map_structure(
lambda x: ArrayWrapper(x),
step_outputs) if step_idx == 0 else map_structure(
lambda x, x_array: x_array.append(x), step_outputs,
outputs)
inputs, states, finished, sequence_lengths = (
next_inputs, next_states, next_finished,
next_sequence_lengths)
layers.increment(x=step_idx_tensor, value=1.0, in_place=True)
step_idx += 1
layers.logical_not(layers.reduce_all(finished), cond)
if self.max_step_num is not None and step_idx > self.max_step_num:
break
final_outputs = map_structure(
lambda x: fluid.layers.stack(x.array, axis=0), outputs)
final_states = states
try:
final_outputs, final_states = self.decoder.finalize(
final_outputs, final_states, sequence_lengths)
except NotImplementedError:
pass
if not self.output_time_major:
final_outputs = map_structure(
lambda x: layers.transpose(x, [1, 0] + list(
range(2, len(x.shape)))), final_outputs)
return (final_outputs, final_states,
sequence_lengths) if self.return_length else (final_outputs,
final_states)
else:
return fluid.layers.dynamic_decode(
self.decoder,
inits,
max_step_num=self.max_step_num,
output_time_major=self.output_time_major,
impute_finished=self.impute_finished,
is_test=self.is_test,
return_length=self.return_length,
**kwargs)
class Conv1dPoolLayer(Layer):
"""
This interface is used to construct a callable object of the ``Conv1DPoolLayer``
class. The ``Conv1DPoolLayer`` class does a ``Conv1D`` and a ``Pool1D`` .
For more details, refer to code examples.The ``Conv1DPoolLayer`` layer calculates
the output based on the input, filter and strides, paddings, dilations, groups,
global_pooling, pool_type, ceil_mode, exclusive parameters.
Parameters:
num_channels (int): The number of channels in the input data.
num_filters(int): The number of filters. It is the same as the output channels.
filter_size (int): The filter size of Conv1DPoolLayer.
pool_size (int): The pooling size of Conv1DPoolLayer.
conv_stride (int): The stride size of the conv Layer in Conv1DPoolLayer.
Default: 1
pool_stride (int): The stride size of the pool layer in Conv1DPoolLayer.
Default: 1
conv_padding (int): The padding size of the conv Layer in Conv1DPoolLayer.
Default: 0
pool_padding (int): The padding of pool layer in Conv1DPoolLayer.
Default: 0
act (str): Activation type for conv layer, if it is set to None, activation
is not appended. Default: None.
pool_type (str): Pooling type can be `max` for max-pooling or `avg` for
average-pooling. Default: `max`
dilation (int): The dilation size of the conv Layer. Default: 1.
groups (int): The groups number of the conv Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the
first half of the filters is only connected to the first half of the
input channels, while the second half of the filters is only connected
to the second half of the input channels. Default: 1.
global_pooling (bool): Whether to use the global pooling. If it is true,
`pool_size` and `pool_padding` would be ignored. Default: False
ceil_mode (bool, optional): Whether to use the ceil function to calculate output
height and width.False is the default. If it is set to False, the floor function
will be used. Default: False.
exclusive (bool, optional): Whether to exclude padding points in average pooling mode.
Default: True.
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: False
param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
Example:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import Conv1dPoolLayer
# input: [batch_size, num_channels, sequence_length]
input = paddle.rand((2, 32, 4))
cov2d = Conv1dPoolLayer(num_channels=32,
num_filters=64,
filter_size=2,
pool_size=2)
output = cov2d(input)
"""
def __init__(self,
num_channels,
num_filters,
filter_size,
pool_size,
conv_stride=1,
pool_stride=1,
conv_padding=0,
pool_padding=0,
act=None,
pool_type='max',
global_pooling=False,
dilation=1,
groups=None,
ceil_mode=False,
exclusive=True,
use_cudnn=False,
param_attr=None,
bias_attr=None):
super(Conv1dPoolLayer, self).__init__()
self._conv2d = Conv2D(
num_channels=num_channels,
num_filters=num_filters,
filter_size=[filter_size, 1],
stride=[conv_stride, 1],
padding=[conv_padding, 0],
dilation=[dilation, 1],
groups=groups,
param_attr=param_attr,
bias_attr=bias_attr,
use_cudnn=use_cudnn,
act=act)
self._pool2d = Pool2D(
pool_size=[pool_size, 1],
pool_type=pool_type,
pool_stride=[pool_stride, 1],
pool_padding=[pool_padding, 0],
global_pooling=global_pooling,
use_cudnn=use_cudnn,
ceil_mode=ceil_mode,
exclusive=exclusive)
def forward(self, input):
"""
Performs conv1d and pool1d on the input.
Parameters:
input (Variable): A 3-D Tensor, shape is [N, C, H] where N, C and H
representing `batch_size`, `num_channels` and `sequence_length`
separately. data type can be float32 or float64
Returns:
Variable: The 3-D output tensor after conv and pool. It has the same \
data type as input.
"""
x = fluid.layers.unsqueeze(input, axes=[-1])
x = self._conv2d(x)
x = self._pool2d(x)
x = fluid.layers.squeeze(x, axes=[-1])
return x
class CNNEncoder(Layer):
"""
This interface is used to construct a callable object of the ``CNNEncoder``
class. The ``CNNEncoder`` is composed of multiple ``Conv1dPoolLayer`` .
``CNNEncoder`` can define every Conv1dPoolLayer with different or same parameters.
The ``Conv1dPoolLayer`` in ``CNNEncoder`` is parallel. The results of every
``Conv1dPoolLayer`` will concat at the channel dimension as the final output.
Parameters:
num_channels(int|list|tuple): The number of channels in the input data. If
`num_channels` is a list or tuple, the length of `num_channels` must
equal to `num_layers`. If `num_channels` is a int, all conv1dpoollayer's
`num_channels` are the value of `num_channels`.
num_filters(int|list|tuple): The number of filters. It is the same as the
output channels. If `num_filters` is a list or tuple, the length of
`num_filters` must equal `num_layers`. If `num_filters` is a int,
all conv1dpoollayer's `num_filters` are the value of `num_filters`.
filter_size(int|list|tuple): The filter size of Conv1DPoolLayer in CNNEncoder.
If `filter_size` is a list or tuple, the length of `filter_size` must
equal `num_layers`. If `filter_size` is a int, all conv1dpoollayer's
`filter_size` are the value of `filter_size`.
pool_size(int|list|tuple): The pooling size of Conv1DPoolLayer in CNNEncoder.
If `pool_size` is a list or tuple, the length of `pool_size` must equal
`num_layers`. If `pool_size` is a int, all conv1dpoollayer's `pool_size`
are the value of `pool_size`.
num_layers(int): The number of conv1dpoolLayer used in CNNEncoder.
conv_stride(int|list|tuple): The stride size of the conv Layer in Conv1DPoolLayer.
If `conv_stride` is a list or tuple, the length of `conv_stride` must
equal `num_layers`. If conv_stride is a int, all conv1dpoollayer's `conv_stride`
are the value of `conv_stride`. Default: 1
pool_stride(int|list|tuple): The stride size of the pool layer in Conv1DPoolLayer.
If `pool_stride` is a list or tuple, the length of `pool_stride` must
equal `num_layers`. If `pool_stride` is a int, all conv1dpoollayer's `pool_stride`
are the value of `pool_stride`. Default: 1
conv_padding(int|list|tuple): The padding size of the conv Layer in Conv1DPoolLayer.
If `conv_padding` is a list or tuple, the length of `conv_padding` must
equal `num_layers`. If `conv_padding` is a int, all conv1dpoollayer's `conv_padding`
are the value of `conv_padding`. Default: 0
pool_padding(int|list|tuple): The padding size of pool layer in Conv1DPoolLayer.
If `pool_padding` is a list or tuple, the length of `pool_padding` must
equal `num_layers`.If `pool_padding` is a int, all conv1dpoollayer's `pool_padding`
are the value of `pool_padding`. Default: 0
act (str|list|tuple): Activation type for `Conv1dPoollayer` layer, if it is set to None,
activation is not appended. Default: None.
pool_type (str): Pooling type can be `max` for max-pooling or `avg` for
average-pooling. Default: `max`
global_pooling (bool): Whether to use the global pooling. If it is true,
`pool_size` and `pool_padding` would be ignored. Default: False
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: False
Example:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import CNNEncoder
# input: [batch_size, num_channels, sequence_length]
input = paddle.rand((2, 32, 8))
cov_encoder = CNNEncoder(num_layers=2,
num_channels=32,
num_filters=64,
filter_size=[2, 3],
pool_size=[7, 6])
output = cov_encoder(input) # [2, 128, 1]
"""
def __init__(self,
num_channels,
num_filters,
filter_size,
pool_size,
num_layers=1,
conv_stride=1,
pool_stride=1,
conv_padding=0,
pool_padding=0,
act=None,
pool_type='max',
global_pooling=False,
use_cudnn=False):
super(CNNEncoder, self).__init__()
self.num_layers = num_layers
self.num_channels = num_channels
self.num_filters = num_filters
self.filter_size = filter_size
self.pool_size = pool_size
self.conv_stride = conv_stride
self.pool_stride = pool_stride
self.conv_padding = conv_padding
self.pool_padding = pool_padding
self.use_cudnn = use_cudnn
self.act = act
self.pool_type = pool_type
self.global_pooling = global_pooling
self.conv1d_pool_layers = fluid.dygraph.LayerList([
Conv1dPoolLayer(
num_channels=self.num_channels
if isinstance(self.num_channels, int) else self.num_channels[i],
num_filters=self.num_filters
if isinstance(self.num_channels, int) else self.num_filters[i],
filter_size=self.filter_size
if isinstance(self.filter_size, int) else self.filter_size[i],
pool_size=self.pool_size
if isinstance(self.pool_size, int) else self.pool_size[i],
conv_stride=self.conv_stride
if isinstance(self.conv_stride, int) else self.conv_stride[i],
pool_stride=self.pool_stride
if isinstance(self.pool_stride, int) else self.pool_stride[i],
conv_padding=self.conv_padding
if isinstance(self.conv_padding, int) else self.conv_padding[i],
pool_padding=self.pool_padding
if isinstance(self.pool_padding, int) else self.pool_padding[i],
act=self.act[i]
if isinstance(self.act, (list, tuple)) else self.act,
pool_type=self.pool_type,
global_pooling=self.global_pooling,
use_cudnn=self.use_cudnn) for i in range(num_layers)
])
def forward(self, input):
"""
Performs multiple parallel conv1d and pool1d, and concat the results of
them at the channel dimension to produce the final output.
Parameters:
input (Variable): A 3-D Tensor, shape is [N, C, H] where N, C and H
representing `batch_size`, `num_channels` and `sequence_length`
separately. data type can be float32 or float64
Returns:
Variable: The 3-D output tensor produced by concatenating results of \
all Conv1dPoolLayer. It has the same data type as input.
"""
res = [
conv1d_pool_layer(input)
for conv1d_pool_layer in self.conv1d_pool_layers
]
out = fluid.layers.concat(input=res, axis=1)
return out
class TransformerCell(RNNCell):
"""
TransformerCell wraps a Transformer decoder producing logits from `inputs`
composed by ids and position.
Parameters:
decoder(callable): A TransformerDecoder instance. Or a wrapper of it that
includes a embedding layer accepting ids and positions instead of embeddings
and includes a output layer transforming decoder output features to logits.
embedding_fn(function, optional): A callable that accepts ids and position
as arguments and return embeddings as input of `decoder`. It can be
None if `decoder` includes a embedding layer. Default None.
output_fn(callable, optional): A callable applid on `decoder` output to
transform decoder output features to get logits. Mostly it is a Linear
layer with vocabulary size. It can be None if `decoder` includes a
output layer. Default None.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, Linear
from paddle.incubate.hapi.text import TransformerDecoder
from paddle.incubate.hapi.text import TransformerCell
from paddle.incubate.hapi.text import TransformerBeamSearchDecoder
from paddle.incubate.hapi.text import DynamicDecode
paddle.enable_dygraph()
class Embedder(fluid.dygraph.Layer):
def __init__(self):
super(Embedder, self).__init__()
self.word_embedder = Embedding(size=[1000, 128])
self.pos_embedder = Embedding(size=[500, 128])
def forward(self, word, position):
return self.word_embedder(word) + self.pos_embedder(position)
embedder = Embedder()
output_layer = Linear(128, 1000)
decoder = TransformerDecoder(2, 2, 64, 64, 128, 512)
transformer_cell = TransformerCell(decoder, embedder, output_layer)
dynamic_decoder = DynamicDecode(
TransformerBeamSearchDecoder(
transformer_cell,
start_token=0,
end_token=1,
beam_size=4,
var_dim_in_state=2),
max_step_num=10,
is_test=True)
enc_output = paddle.rand((2, 4, 128))
# cross attention bias: [batch_size, n_head, trg_len, src_len]
trg_src_attn_bias = paddle.rand((2, 2, 1, 4))
# inputs for beam search on Transformer
caches = transformer_cell.get_initial_states(enc_output)
enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
enc_output, beam_size=4)
trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
trg_src_attn_bias, beam_size=4)
static_caches = decoder.prepare_static_cache(enc_output)
outputs = dynamic_decoder(
inits=caches,
enc_output=enc_output,
trg_src_attn_bias=trg_src_attn_bias,
static_caches=static_caches)
"""
def __init__(self, decoder, embedding_fn=None, output_fn=None):
super(TransformerCell, self).__init__()
self.decoder = decoder
self.embedding_fn = embedding_fn
self.output_fn = output_fn
def forward(self,
inputs,
states=None,
enc_output=None,
trg_slf_attn_bias=None,
trg_src_attn_bias=None,
static_caches=[]):
"""
Produces logits from `inputs` composed by ids and positions.
Parameters:
inputs(tuple): A tuple includes target ids and positions. The two
tensors both have int64 data type and with 2D shape
`[batch_size, sequence_length]` where `sequence_length` is 1
for inference.
states(list): It caches the multi-head attention intermediate results
of history decoding steps. It is a list of dict where the length
of list is decoder layer number, and each dict has `k` and `v` as
keys and values are cached results. Default None
enc_output(Variable): The output of Transformer encoder. It is a tensor
with shape `[batch_size, sequence_length, d_model]`. The data type
should be float32 or float64.
trg_slf_attn_bias(Variable, optional): A tensor used in decoder self
attention to mask out attention on unwanted target positions. It
is a tensor with shape `[batch_size, n_head, target_length, target_length]`,
where the unwanted positions have `-INF` values and the others
have 0 values. It can be None when nothing wanted or needed to
be masked out. It can be None for inference. The data type should
be float32 or float64. Default None
trg_src_attn_bias(Variable, optional): A tensor used in decoder-encoder
cross attention to mask out unwanted attention on source (encoder output).
It is a tensor with shape `[batch_size, n_head, target_length, source_length]`,
where the unwanted positions have `-INF` values and the others
have 0 values. It can be None when nothing wanted or needed to
be masked out. The data type should be float32 or float64. Default None
static_caches(list): It stores projected results of encoder output
to be used as keys and values in decoder-encoder cross attention
It is a list of dict where the length of list is decoder layer
number, and each dict has `static_k` and `static_v` as keys and
values are stored results. Default empty list
Returns:
tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` \
is a float32 or float64 3D tensor representing logits shaped \
`[batch_size, sequence_length, vocab_size]`. `new_states has \
the same structure and data type with `states` while the length \
is one larger since the intermediate results of current step are \
concatenated into it.
"""
trg_word, trg_pos = inputs
if states and static_caches:
for cache, static_cache in zip(states, static_caches):
cache.update(static_cache)
if self.embedding_fn is not None:
dec_input = self.embedding_fn(trg_word, trg_pos)
outputs = self.decoder(dec_input, enc_output, None,
trg_src_attn_bias, states)
else:
outputs = self.decoder(trg_word, trg_pos, enc_output, None,
trg_src_attn_bias, states)
if self.output_fn is not None:
outputs = self.output_fn(outputs)
new_states = [{
"k": cache["k"],
"v": cache["v"]
} for cache in states] if states else states
return outputs, new_states
@property
def state_shape(self):
"""
States of TransformerCell cache the multi-head attention intermediate
results of history decoding steps, and have a increasing length as
decoding continued.
`state_shape` of TransformerCell is used to initialize states. It is a
list of dict where the length of list is decoder layer, and each dict
has `k` and `v` as keys and values are `[n_head, 0, d_key]`, `[n_head, 0, d_value]`
separately. (-1 for batch size would be automatically inserted into shape).
Returns:
list: It is a list of dict where the length of list is decoder layer \
number, and each dict has `k` and `v` as keys and values are cached \
results.
"""
return [{
"k": [self.decoder.n_head, 0, self.decoder.d_key],
"v": [self.decoder.n_head, 0, self.decoder.d_value],
} for i in range(self.decoder.n_layer)]
class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):
"""
Compared with a RNN step :code:`outputs, new_states = cell(inputs, states)`,
Transformer decoder's `inputs` uses 2D tensor shaped `[batch_size * beam_size, 1]`
and includes extra position data. And its `states` (caches) has increasing
length. These are not consistent with `BeamSearchDecoder`, thus subclass
`BeamSearchDecoder` to make beam search adapt to Transformer decoder.
Parameters:
cell(TransformerCell): An instance of `TransformerCell`.
start_token(int): The start token id.
end_token(int): The end token id.
beam_size(int): The beam width used in beam search.
var_dim_in_state(int): Indicate which dimension of states is variant.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, Linear
from paddle.incubate.hapi.text import TransformerDecoder
from paddle.incubate.hapi.text import TransformerCell
from paddle.incubate.hapi.text import TransformerBeamSearchDecoder
from paddle.incubate.hapi.text import DynamicDecode
paddle.enable_dygraph()
class Embedder(fluid.dygraph.Layer):
def __init__(self):
super(Embedder, self).__init__()
self.word_embedder = Embedding(size=[1000, 128])
self.pos_embedder = Embedding(size=[500, 128])
def forward(self, word, position):
return self.word_embedder(word) + self.pos_embedder(position)
embedder = Embedder()
output_layer = Linear(128, 1000)
decoder = TransformerDecoder(2, 2, 64, 64, 128, 512)
transformer_cell = TransformerCell(decoder, embedder, output_layer)
dynamic_decoder = DynamicDecode(
TransformerBeamSearchDecoder(
transformer_cell,
start_token=0,
end_token=1,
beam_size=4,
var_dim_in_state=2),
max_step_num=10,
is_test=True)
enc_output = paddle.rand((2, 4, 128))
# cross attention bias: [batch_size, n_head, trg_len, src_len]
trg_src_attn_bias = paddle.rand((2, 2, 1, 4))
# inputs for beam search on Transformer
caches = transformer_cell.get_initial_states(enc_output)
enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
enc_output, beam_size=4)
trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
trg_src_attn_bias, beam_size=4)
static_caches = decoder.prepare_static_cache(enc_output)
outputs = dynamic_decoder(
inits=caches,
enc_output=enc_output,
trg_src_attn_bias=trg_src_attn_bias,
static_caches=static_caches)
"""
def __init__(self, cell, start_token, end_token, beam_size,
var_dim_in_state):
super(TransformerBeamSearchDecoder,
self).__init__(cell, start_token, end_token, beam_size)
self.cell = cell
self.var_dim_in_state = var_dim_in_state
def _merge_batch_beams_with_var_dim(self, x):
"""
Reshape a tensor with shape `[batch_size, beam_size, ...]` to a new
tensor with shape `[batch_size * beam_size, ...]`.
Parameters:
x(Variable): A tensor with shape `[batch_size, beam_size, ...]`. The
data type should be float32, float64, int32, int64 or bool.
Returns:
Variable: A tensor with shape `[batch_size * beam_size, ...]`, whose \
data type is same as `x`.
"""
# init length of cache is 0, and it increases with decoding carrying on,
# thus need to reshape elaborately
var_dim_in_state = self.var_dim_in_state + 1 # count in beam dim
x = layers.transpose(x,
list(range(var_dim_in_state, len(x.shape))) +
list(range(0, var_dim_in_state)))
x = layers.reshape(
x, [0] * (len(x.shape) - var_dim_in_state
) + [self.batch_size * self.beam_size] +
[int(size) for size in x.shape[-var_dim_in_state + 2:]])
x = layers.transpose(
x,
list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) +
list(range(0, (len(x.shape) + 1 - var_dim_in_state))))
return x
def _split_batch_beams_with_var_dim(self, x):
"""
Reshape a tensor with shape `[batch_size * beam_size, ...]` to a new
tensor with shape `[batch_size, beam_size, ...]`.
Parameters:
x(Variable): A tensor with shape `[batch_size * beam_size, ...]`. The
data type should be float32, float64, int32, int64 or bool.
Returns:
Variable: A tensor with shape `[batch_size, beam_size, ...]`, whose \
data type is same as `x`.
"""
var_dim_size = layers.shape(x)[self.var_dim_in_state]
x = layers.reshape(
x, [-1, self.beam_size] +
[int(size)
for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] +
[int(size) for size in x.shape[self.var_dim_in_state + 1:]])
return x
def step(self, time, inputs, states, **kwargs):
"""
Perform a beam search decoding step, which uses `cell` to get probabilities,
and follows a beam search step to calculate scores and select candidate
token ids.
Note: compared with `BeamSearchDecoder.step`, it feed 2D id tensor shaped
`[batch_size * beam_size, 1]` rather than `[batch_size * beam_size]` combined
position data as inputs to `cell`.
Parameters:
time(Variable): An `int64` tensor with shape `[1]` provided by the caller,
representing the current time step number of decoding.
inputs(Variable): A tensor variable. It is same as `initial_inputs`
returned by `initialize()` for the first decoding step and
`next_inputs` returned by `step()` for the others. It is a int64
id tensor with shape `[batch_size * beam_size]`
states(Variable): A structure of tensor variables.
It is same as the `initial_states` returned by `initialize()` for
the first decoding step and `beam_search_state` returned by
`step()` for the others.
**kwargs: Additional keyword arguments, provided by the caller.
Returns:
tuple: A tuple( :code:`(beam_search_output, beam_search_state, next_inputs, finished)` ). \
`beam_search_state` and `next_inputs` have the same structure, \
shape and data type as the input arguments `states` and `inputs` separately. \
`beam_search_output` is a namedtuple(including scores, predicted_ids, \
parent_ids as fields) of tensor variables, where \
`scores, predicted_ids, parent_ids` all has a tensor value shaped \
`[batch_size, beam_size]` with data type `float32, int64, int64`. \
`finished` is a `bool` tensor with shape `[batch_size, beam_size]`.
"""
# compared to RNN, Transformer has 3D data at every decoding step
inputs = layers.reshape(inputs, [-1, 1]) # token
pos = layers.ones_like(inputs) * time # pos
cell_states = map_structure(self._merge_batch_beams_with_var_dim,
states.cell_states)
cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states,
**kwargs)
# squeeze to adapt to BeamSearchDecoder which use 2D logits
cell_outputs = map_structure(
lambda x: layers.squeeze(x, [1]) if len(x.shape) == 3 else x,
cell_outputs)
cell_outputs = map_structure(self._split_batch_beams, cell_outputs)
next_cell_states = map_structure(self._split_batch_beams_with_var_dim,
next_cell_states)
beam_search_output, beam_search_state = self._beam_search_step(
time=time,
logits=cell_outputs,
next_cell_states=next_cell_states,
beam_state=states)
next_inputs, finished = (beam_search_output.predicted_ids,
beam_search_state.finished)
return (beam_search_output, beam_search_state, next_inputs, finished)
### Transformer Modules ###
class PrePostProcessLayer(Layer):
"""
PrePostProcessLayer is used before/after each multi-head attention(MHA) and
feed-forward network(FFN) sub-layer to perform some specific process on
inputs/outputs.
Parameters:
process_cmd (str): The process applied before/after each MHA and
FFN sub-layer. It should be a string composed of `d`, `a`, `n`,
where `d` for dropout, `a` for add residual connection, `n` for
layer normalization.
d_model (int): The expected feature size in the input and output.
dropout_rate (float): The dropout probability if the process includes
dropout. Default 0.1
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import PrePostProcessLayer
# input: [batch_size, sequence_length, d_model]
x = paddle.rand((2, 4, 32))
process = PrePostProcessLayer('n', 32)
out = process(x) # [2, 4, 32]
"""
def __init__(self, process_cmd, d_model, dropout_rate=0.1):
super(PrePostProcessLayer, self).__init__()
self.process_cmd = process_cmd
self.functors = []
for cmd in self.process_cmd:
if cmd == "a": # add residual connection
self.functors.append(lambda x, y: x + y if y is not None else x)
elif cmd == "n": # add layer normalization
layer_norm = LayerNorm(
normalized_shape=d_model,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.)))
self.functors.append(
self.add_sublayer(
"layer_norm_%d" % len(
self.sublayers(include_sublayers=False)),
layer_norm))
elif cmd == "d": # add dropout
self.functors.append(lambda x: layers.dropout(
x, dropout_prob=dropout_rate, is_test=False)
if dropout_rate else x)
def forward(self, x, residual=None):
"""
Applies `process_cmd` specified process on `x`.
Parameters:
x (Variable): The tensor to be processed. The data type should be float32
or float64. The shape is `[batch_size, sequence_length, d_model]`.
residual (Variable, optional): Only used if the process includes
residual connection. It has the same shape and data type as `x`.
Default None
Returns:
Variable: The processed tensor. It has the same shape and data type \
as `x`.
"""
for i, cmd in enumerate(self.process_cmd):
if cmd == "a":
x = self.functors[i](x, residual)
else:
x = self.functors[i](x)
return x
class MultiHeadAttention(Layer):
"""
MultiHead Attention mapps queries and a set of key-value pairs to outputs
by jointly attending to information from different representation subspaces,
as what multi-head indicates it performs multiple attention in parallel.
Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
for more details.
Parameters:
d_key (int): The feature size to transformer queries and keys as in
multi-head attention. Mostly it equals to `d_model // n_head`.
d_value (int): The feature size to transformer values as in multi-head
attention. Mostly it equals to `d_model // n_head`.
d_model (int): The expected feature size in the input and output.
n_head (int): The number of heads in multi-head attention(MHA).
dropout_rate (float, optional): The dropout probability used in MHA to
drop some attention target. Default 0.1
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import MultiHeadAttention
# encoder input: [batch_size, sequence_length, d_model]
query = paddle.rand((2, 4, 128))
# self attention bias: [batch_size, n_head, src_len, src_len]
attn_bias = paddle.rand((2, 2, 4, 4))
multi_head_attn = MultiHeadAttention(64, 64, 128, n_head=2)
output = multi_head_attn(query, attn_bias=attn_bias) # [2, 4, 128]
"""
def __init__(self, d_key, d_value, d_model, n_head, dropout_rate=0.1):
super(MultiHeadAttention, self).__init__()
self.n_head = n_head
self.d_key = d_key
self.d_value = d_value
self.d_model = d_model
self.dropout_rate = dropout_rate
self.q_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
self.k_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
self.v_fc = Linear(
input_dim=d_model, output_dim=d_value * n_head, bias_attr=False)
self.proj_fc = Linear(
input_dim=d_value * n_head, output_dim=d_model, bias_attr=False)
def _prepare_qkv(self, queries, keys, values, cache=None):
"""
Prapares linear projected queries, keys and values for usage of subsequnt
multiple attention in parallel. If `cache` is not None, using cached
results to reduce redundant calculations.
Parameters:
queries (Variable): The queries for multi-head attention. It is a
tensor with shape `[batch_size, sequence_length, d_model]`. The
data type should be float32 or float64.
keys (Variable, optional): The keys for multi-head attention. It is
a tensor with shape `[batch_size, sequence_length, d_model]`. The
data type should be float32 or float64.
values (Variable, optional): The values for multi-head attention. It
is a tensor with shape `[batch_size, sequence_length, d_model]`.
The data type should be float32 or float64.
cache(dict, optional): It is a dict with `k` and `v` as keys, and
values cache the multi-head attention intermediate results of
history decoding steps for decoder self attention; Or a dict
with `static_k` and `statkc_v` as keys, and values stores intermediate
results of encoder output for decoder-encoder cross attention.
If it is for decoder self attention, values for `k` and `v` would
be updated by new tensors concatanating raw tensors with intermediate
results of current step. It is only used for inference and should
be None for training. Default None
Returns:
tuple: A tuple including linear projected keys and values. These two \
tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \
and `[batch_size, n_head, sequence_length, d_value]` separately, \
and their data types are same as inputs.
"""
if keys is None: # self-attention
keys, values = queries, queries
static_kv = False
else: # cross-attention
static_kv = True
q = self.q_fc(queries)
q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key])
q = layers.transpose(x=q, perm=[0, 2, 1, 3])
if cache is not None and static_kv and "static_k" in cache:
# for encoder-decoder attention in inference and has cached
k = cache["static_k"]
v = cache["static_v"]
else:
k = self.k_fc(keys)
v = self.v_fc(values)
k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
k = layers.transpose(x=k, perm=[0, 2, 1, 3])
v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
v = layers.transpose(x=v, perm=[0, 2, 1, 3])
if cache is not None:
if static_kv and not "static_k" in cache:
# for encoder-decoder attention in inference and has not cached
cache["static_k"], cache["static_v"] = k, v
elif not static_kv:
# for decoder self-attention in inference
cache_k, cache_v = cache["k"], cache["v"]
k = layers.concat([cache_k, k], axis=2)
v = layers.concat([cache_v, v], axis=2)
cache["k"], cache["v"] = k, v
return q, k, v
def forward(self,
queries,
keys=None,
values=None,
attn_bias=None,
cache=None):
"""
Applies multi-head attention to map queries and a set of key-value pairs
to outputs.
Parameters:
queries (Variable): The queries for multi-head attention. It is a
tensor with shape `[batch_size, sequence_length, d_model]`. The
data type should be float32 or float64.
keys (Variable, optional): The keys for multi-head attention. It is
a tensor with shape `[batch_size, sequence_length, d_model]`. The
data type should be float32 or float64.
values (Variable, optional): The values for multi-head attention. It
is a tensor with shape `[batch_size, sequence_length, d_model]`.
The data type should be float32 or float64.
attn_bias (Variable, optional): A tensor used in multi-head attention
to mask out attention on unwanted positions, usually the
paddings or the subsequent positions. It is a tensor with shape
`[batch_size, n_head, sequence_length, sequence_length]`,
where the unwanted positions have `-INF` values and the others
have 0 values. The data type should be float32 or float64. It can
be None when nothing wanted or needed to be masked out. Default None
cache(dict, optional): It is a dict with `k` and `v` as keys, and
values cache the multi-head attention intermediate results of
history decoding steps for decoder self attention; Or a dict
with `static_k` and `statkc_v` as keys, and values stores intermediate
results of encoder output for decoder-encoder cross attention.
If it is for decoder self attention, values for `k` and `v` would
be updated by new tensors concatanating raw tensors with intermediate
results of current step. It is only used for inference and should
be None for training. Default None
Returns:
Variable: The output of multi-head attention. It is a tensor \
that has the same shape and data type as `queries`.
"""
# compute q ,k ,v
q, k, v = self._prepare_qkv(queries, keys, values, cache)
# scale dot product attention
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.d_key**-0.5)
if attn_bias is not None:
product += attn_bias
weights = layers.softmax(product)
if self.dropout_rate:
weights = layers.dropout(
weights, dropout_prob=self.dropout_rate, is_test=False)
out = layers.matmul(weights, v)
# combine heads
out = layers.transpose(out, perm=[0, 2, 1, 3])
out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
# project to output
out = self.proj_fc(out)
return out
def cal_kv(self, keys, values):
"""
Applies linear projection on input keys and values, then splits heads
(reshape and transpose) to get keys and values from different representation
subspaces for usage of subsequnt multiple attention in parallel.
Parameters:
keys (Variable, optional): The keys for multi-head attention. It is
a tensor with shape `[batch_size, sequence_length, d_model]`. The
data type should be float32 or float64.
values (Variable, optional): The values for multi-head attention. It
is a tensor with shape `[batch_size, sequence_length, d_model]`.
The data type should be float32 or float64.
Returns:
tuple: A tuple including linear projected keys and values. These two \
tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \
and `[batch_size, n_head, sequence_length, d_value]` separately, \
and their data types are same as inputs.
"""
k = self.k_fc(keys)
v = self.v_fc(values)
k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
k = layers.transpose(x=k, perm=[0, 2, 1, 3])
v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
v = layers.transpose(x=v, perm=[0, 2, 1, 3])
return k, v
class FFN(Layer):
"""
A fully connected feed-forward network applied to each position separately
and identically. This consists of two linear transformations with a activation
and dropout in between.
Parameters:
d_inner_hid (int): The hidden size in the feedforward network(FFN).
d_model (int): The expected feature size in the input and output.
dropout_rate (float, optional): The dropout probability used after
activition. Default 0.1
ffn_fc1_act (str, optional): The activation function in the feedforward
network. Default relu.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import FFN
# input: [batch_size, sequence_length, d_model]
x = paddle.rand((2, 4, 32))
ffn = FFN(128, 32)
out = ffn(x) # [2, 4, 32]
"""
def __init__(self, d_inner_hid, d_model, dropout_rate=0.1, fc1_act="relu"):
super(FFN, self).__init__()
self.dropout_rate = dropout_rate
self.fc1 = Linear(
input_dim=d_model, output_dim=d_inner_hid, act=fc1_act)
self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model)
def forward(self, x):
"""
Applies a fully connected feed-forward network on each position of the
input sequences separately and identically.
Parameters:
x (Variable): The input of feed-forward network. It is a tensor
with shape `[batch_size, sequence_length, d_model]`. The data
type should be float32 or float64.
Returns:
Variable: The output of feed-forward network. It is a tensor that has \
the same shape and data type as `enc_input`.
"""
hidden = self.fc1(x)
if self.dropout_rate:
hidden = layers.dropout(
hidden, dropout_prob=self.dropout_rate, is_test=False)
out = self.fc2(hidden)
return out
class TransformerEncoderLayer(Layer):
"""
TransformerEncoderLayer is composed of two sub-layers which are self (multi-head)
attention and feedforward network. Before and after each sub-layer, pre-process
and post-precess would be applied on the input and output.
Parameters:
n_head (int): The number of heads in multi-head attention(MHA).
d_key (int): The feature size to transformer queries and keys as in
multi-head attention. Mostly it equals to `d_model // n_head`.
d_value (int): The feature size to transformer values as in multi-head
attention. Mostly it equals to `d_model // n_head`.
d_model (int): The expected feature size in the input and output.
d_inner_hid (int): The hidden layer size in the feedforward network(FFN).
prepostprocess_dropout (float, optional): The dropout probability used
in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1
attention_dropout (float, optional): The dropout probability used
in MHA to drop some attention target. Default 0.1
relu_dropout (float, optional): The dropout probability used after FFN
activition. Default 0.1
preprocess_cmd (str, optional): The process applied before each MHA and
FFN sub-layer, and it also would be applied on output of the last
stacked layer. It should be a string composed of `d`, `a`, `n`,
where `d` for dropout, `a` for add residual connection, `n` for
layer normalization. Default `n`.
postprocess_cmd (str, optional): The process applied after each MHA and
FFN sub-layer. Same as `preprocess_cmd`. It should be a string
composed of `d`, `a`, `n`, where `d` for dropout, `a` for add
residual connection, `n` for layer normalization. Default `da`.
ffn_fc1_act (str, optional): The activation function in the feedforward
network. Default relu.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import TransformerEncoderLayer
# encoder input: [batch_size, src_len, d_model]
enc_input = paddle.rand((2, 4, 128))
# self attention bias: [batch_size, n_head, src_len, src_len]
attn_bias = paddle.rand((2, 2, 4, 4))
encoder_layer = TransformerEncoderLayer(2, 64, 64, 128, 512)
enc_output = encoder_layer(enc_input, attn_bias) # [2, 4, 128]
"""
def __init__(self,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
preprocess_cmd="n",
postprocess_cmd="da",
ffn_fc1_act="relu"):
super(TransformerEncoderLayer, self).__init__()
self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
attention_dropout)
self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act)
self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
def forward(self, enc_input, attn_bias=None):
"""
Applies a Transformer encoder layer on the input.
Parameters:
enc_input (Variable): The input of Transformer encoder layer. It is
a tensor with shape `[batch_size, sequence_length, d_model]`.
The data type should be float32 or float64.
attn_bias(Variable, optional): A tensor used in encoder self attention
to mask out attention on unwanted positions, usually the paddings. It
is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`,
where the unwanted positions have `-INF` values and the others
have 0 values. The data type should be float32 or float64. It can
be None when nothing wanted or needed to be masked out. Default None
Returns:
Variable: The output of Transformer encoder layer. It is a tensor that \
has the same shape and data type as `enc_input`.
"""
attn_output = self.self_attn(
self.preprocesser1(enc_input), None, None, attn_bias)
attn_output = self.postprocesser1(attn_output, enc_input)
ffn_output = self.ffn(self.preprocesser2(attn_output))
ffn_output = self.postprocesser2(ffn_output, attn_output)
return ffn_output
class TransformerEncoder(Layer):
"""
TransformerEncoder is a stack of N encoder layers.
Parameters:
n_layer (int): The number of encoder layers to be stacked.
n_head (int): The number of heads in multi-head attention(MHA).
d_key (int): The feature size to transformer queries and keys as in
multi-head attention. Mostly it equals to `d_model // n_head`.
d_value (int): The feature size to transformer values as in multi-head
attention. Mostly it equals to `d_model // n_head`.
d_model (int): The expected feature size in the input and output.
d_inner_hid (int): The hidden layer size in the feedforward network(FFN).
prepostprocess_dropout (float, optional): The dropout probability used
in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1
attention_dropout (float, optional): The dropout probability used
in MHA to drop some attention target. Default 0.1
relu_dropout (float, optional): The dropout probability used after FFN
activition. Default 0.1
preprocess_cmd (str, optional): The process applied before each MHA and
FFN sub-layer, and it also would be applied on output of the last
stacked layer. It should be a string composed of `d`, `a`, `n`,
where `d` for dropout, `a` for add residual connection, `n` for
layer normalization. Default `n`.
postprocess_cmd (str, optional): The process applied after each MHA and
FFN sub-layer. Same as `preprocess_cmd`. It should be a string
composed of `d`, `a`, `n`, where `d` for dropout, `a` for add
residual connection, `n` for layer normalization. Default `da`.
ffn_fc1_act (str, optional): The activation function in the feedforward
network. Default relu.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import TransformerEncoder
# encoder input: [batch_size, src_len, d_model]
enc_input = paddle.rand((2, 4, 128))
# self attention bias: [batch_size, n_head, src_len, src_len]
attn_bias = paddle.rand((2, 2, 4, 4))
encoder = TransformerEncoder(2, 2, 64, 64, 128, 512)
enc_output = encoder(enc_input, attn_bias) # [2, 4, 128]
"""
def __init__(self,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
preprocess_cmd="n",
postprocess_cmd="da",
ffn_fc1_act="relu"):
super(TransformerEncoder, self).__init__()
self.encoder_layers = list()
for i in range(n_layer):
self.encoder_layers.append(
self.add_sublayer(
"layer_%d" % i,
TransformerEncoderLayer(
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
ffn_fc1_act=ffn_fc1_act)))
self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
def forward(self, enc_input, attn_bias=None):
"""
Applies a stack of N Transformer encoder layers on input sequences.
Parameters:
enc_input (Variable): The input of Transformer encoder. It is a tensor
with shape `[batch_size, sequence_length, d_model]`. The data
type should be float32 or float64.
attn_bias(Variable, optional): A tensor used in encoder self attention
to mask out attention on unwanted positions, usually the paddings. It
is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`,
where the unwanted positions have `-INF` values and the others
have 0 values. The data type should be float32 or float64. It can
be None when nothing wanted or needed to be masked out. Default None
Returns:
Variable: The output of Transformer encoder. It is a tensor that has \
the same shape and data type as `enc_input`.
"""
for encoder_layer in self.encoder_layers:
enc_output = encoder_layer(enc_input, attn_bias)
enc_input = enc_output
return self.processer(enc_output)
class TransformerDecoderLayer(Layer):
"""
TransformerDecoderLayer is composed of three sub-layers which are decoder
self (multi-head) attention, decoder-encoder cross attention and feedforward
network. Before and after each sub-layer, pre-process and post-precess would
be applied on the input and output.
Parameters:
n_head (int): The number of heads in multi-head attention(MHA).
d_key (int): The feature size to transformer queries and keys as in
multi-head attention. Mostly it equals to `d_model // n_head`.
d_value (int): The feature size to transformer values as in multi-head
attention. Mostly it equals to `d_model // n_head`.
d_model (int): The expected feature size in the input and output.
d_inner_hid (int): The hidden layer size in the feedforward network(FFN).
prepostprocess_dropout (float, optional): The dropout probability used
in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1
attention_dropout (float, optional): The dropout probability used
in MHA to drop some attention target. Default 0.1
relu_dropout (float, optional): The dropout probability used after FFN
activition. Default 0.1
preprocess_cmd (str, optional): The process applied before each MHA and
FFN sub-layer, and it also would be applied on output of the last
stacked layer. It should be a string composed of `d`, `a`, `n`,
where `d` for dropout, `a` for add residual connection, `n` for
layer normalization. Default `n`.
postprocess_cmd (str, optional): The process applied after each MHA and
FFN sub-layer. Same as `preprocess_cmd`. It should be a string
composed of `d`, `a`, `n`, where `d` for dropout, `a` for add
residual connection, `n` for layer normalization. Default `da`.
ffn_fc1_act (str, optional): The activation function in the feedforward
network. Default relu.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import TransformerDecoderLayer
# decoder input: [batch_size, trg_len, d_model]
dec_input = paddle.rand((2, 4, 128))
# encoder output: [batch_size, src_len, d_model]
enc_output = paddle.rand((2, 6, 128))
# self attention bias: [batch_size, n_head, trg_len, trg_len]
self_attn_bias = paddle.rand((2, 2, 4, 4))
# cross attention bias: [batch_size, n_head, trg_len, src_len]
cross_attn_bias = paddle.rand((2, 2, 4, 6))
decoder_layer = TransformerDecoderLayer(2, 64, 64, 128, 512)
output = decoder_layer(dec_input,
enc_output,
self_attn_bias,
cross_attn_bias) # [2, 4, 128]
"""
def __init__(self,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
preprocess_cmd="n",
postprocess_cmd="da",
ffn_fc1_act="relu"):
super(TransformerDecoderLayer, self).__init__()
self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
attention_dropout)
self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.cross_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
attention_dropout)
self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act)
self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
def forward(self,
dec_input,
enc_output,
self_attn_bias=None,
cross_attn_bias=None,
cache=None):
"""
Applies a Transformer decoder layer on the input.
Parameters:
dec_input (Variable): The input of Transformer decoder. It is a tensor
with shape `[batch_size, target_length, d_model]`. The data type
should be float32 or float64.
enc_output (Variable): The output of Transformer encoder. It is a tensor
with shape `[batch_size, source_length, d_model]`. The data type
should be float32 or float64.
self_attn_bias (Variable, optional): A tensor used in decoder self attention
to mask out attention on unwanted positions, usually the subsequent positions.
It is a tensor with shape `[batch_size, n_head, target_length, target_length]`,
where the unwanted positions have `-INF` values and the others
have 0 values. The data type should be float32 or float64. It can
be None when nothing wanted or needed to be masked out. Default None
cross_attn_bias (Variable, optional): A tensor used in decoder-encoder cross
attention to mask out attention on unwanted positions, usually the paddings.
It is a tensor with shape `[batch_size, n_head, target_length, target_length]`,
where the unwanted positions have `-INF` values and the others
have 0 values. The data type should be float32 or float64. It can
be None when nothing wanted or needed to be masked out. Default None
caches(dict, optional): It caches the multi-head attention intermediate
results of history decoding steps and encoder output. It is a dict
has `k`, `v`, `static_k`, `statkc_v` as keys and values are cached
results. It is only used for inference and should be None for
training. Default None
Returns:
Variable: The output of Transformer decoder layer. It is a tensor \
that has the same shape and data type as `dec_input`.
"""
self_attn_output = self.self_attn(
self.preprocesser1(dec_input), None, None, self_attn_bias, cache)
self_attn_output = self.postprocesser1(self_attn_output, dec_input)
cross_attn_output = self.cross_attn(
self.preprocesser2(self_attn_output), enc_output, enc_output,
cross_attn_bias, cache)
cross_attn_output = self.postprocesser2(cross_attn_output,
self_attn_output)
ffn_output = self.ffn(self.preprocesser3(cross_attn_output))
ffn_output = self.postprocesser3(ffn_output, cross_attn_output)
return ffn_output
class TransformerDecoder(Layer):
"""
TransformerDecoder is a stack of N decoder layers.
Parameters:
n_layer (int): The number of encoder layers to be stacked.
n_head (int): The number of heads in multi-head attention(MHA).
d_key (int): The feature size to transformer queries and keys as in
multi-head attention. Mostly it equals to `d_model // n_head`.
d_value (int): The feature size to transformer values as in multi-head
attention. Mostly it equals to `d_model // n_head`.
d_model (int): The expected feature size in the input and output.
d_inner_hid (int): The hidden layer size in the feedforward network(FFN).
prepostprocess_dropout (float, optional): The dropout probability used
in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1
attention_dropout (float, optional): The dropout probability used
in MHA to drop some attention target. Default 0.1
relu_dropout (float, optional): The dropout probability used after FFN
activition. Default 0.1
preprocess_cmd (str, optional): The process applied before each MHA and
FFN sub-layer, and it also would be applied on output of the last
stacked layer. It should be a string composed of `d`, `a`, `n`,
where `d` for dropout, `a` for add residual connection, `n` for
layer normalization. Default `n`.
postprocess_cmd (str, optional): The process applied after each MHA and
FFN sub-layer. Same as `preprocess_cmd`. It should be a string
composed of `d`, `a`, `n`, where `d` for dropout, `a` for add
residual connection, `n` for layer normalization. Default `da`.
ffn_fc1_act (str, optional): The activation function in the feedforward
network. Default relu.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import TransformerDecoder
# decoder input: [batch_size, trg_len, d_model]
dec_input = paddle.rand((2, 4, 128))
# encoder output: [batch_size, src_len, d_model]
enc_output = paddle.rand((2, 6, 128))
# self attention bias: [batch_size, n_head, trg_len, trg_len]
self_attn_bias = paddle.rand((2, 2, 4, 4))
# cross attention bias: [batch_size, n_head, trg_len, src_len]
cross_attn_bias = paddle.rand((2, 2, 4, 6))
decoder = TransformerDecoder(2, 2, 64, 64, 128, 512)
dec_output = decoder(dec_input,
enc_output,
self_attn_bias,
cross_attn_bias) # [2, 4, 128]
"""
def __init__(self,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
preprocess_cmd="n",
postprocess_cmd="da",
ffn_fc1_act="relu"):
super(TransformerDecoder, self).__init__()
self.n_layer = n_layer
self.n_head = n_head
self.d_key = d_key
self.d_value = d_value
self.decoder_layers = list()
for i in range(n_layer):
self.decoder_layers.append(
self.add_sublayer(
"layer_%d" % i,
TransformerDecoderLayer(n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout,
preprocess_cmd, postprocess_cmd)))
self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
def forward(self,
dec_input,
enc_output,
self_attn_bias=None,
cross_attn_bias=None,
caches=None):
"""
Applies a stack of N Transformer decoder layers on inputs.
Parameters:
dec_input (Variable): The input of Transformer decoder. It is a tensor
with shape `[batch_size, target_length, d_model]`. The data type
should be float32 or float64.
enc_output (Variable): The output of Transformer encoder. It is a tensor
with shape `[batch_size, source_length, d_model]`. The data type
should be float32 or float64.
self_attn_bias (Variable, optional): A tensor used in decoder self attention
to mask out attention on unwanted positions, usually the subsequent positions.
It is a tensor with shape `[batch_size, n_head, target_length, target_length]`,
where the unwanted positions have `-INF` values and the others
have 0 values. The data type should be float32 or float64. It can
be None when nothing wanted or needed to be masked out. Default None
cross_attn_bias (Variable, optional): A tensor used in decoder-encoder cross
attention to mask out attention on unwanted positions, usually the paddings.
It is a tensor with shape `[batch_size, n_head, target_length, target_length]`,
where the unwanted positions have `-INF` values and the others
have 0 values. The data type should be float32 or float64. It can
be None when nothing wanted or needed to be masked out. Default None
caches(list, optional): It caches the multi-head attention intermediate results
of history decoding steps and encoder output. It is a list of dict
where the length of list is decoder layer number, and each dict
has `k`, `v`, `static_k`, `statkc_v` as keys and values are cached
results. It is only used for inference and should be None for
training. Default None
Returns:
Variable: The output of Transformer decoder. It is a tensor that has \
the same shape and data type as `dec_input`.
"""
for i, decoder_layer in enumerate(self.decoder_layers):
dec_output = decoder_layer(dec_input, enc_output, self_attn_bias,
cross_attn_bias, caches[i]
if caches else None)
dec_input = dec_output
return self.processer(dec_output)
def prepare_static_cache(self, enc_output):
"""
Generate a list of dict where the length of list is decoder layer number.
Each dict has `static_k`, `statkc_v` as keys, and values are projected
results of encoder output to be used as keys and values in decoder-encoder
cross (multi-head) attention. Used in inference.
Parameters:
enc_output (Variable): The output of Transformer encoder. It is a tensor
with shape `[batch_size, source_length, d_model]`. The data type
should be float32 or float64.
Returns:
list: A list of dict. Each dict has `static_k`, `statkc_v` as keys, \
and values are projected results of encoder output to be used as \
keys and values in decoder-encoder cross (multi-head) attention.
"""
return [
dict(
zip(("static_k", "static_v"),
decoder_layer.cross_attn.cal_kv(enc_output, enc_output)))
for decoder_layer in self.decoder_layers
]
def prepare_incremental_cache(self, enc_output):
"""
Generate a list of dict where the length of list is decoder layer number.
Each dict has `k`, `v` as keys, and values are empty tensors with shape
`[batch_size, n_head, 0, d_key]` and `[batch_size, n_head, 0, d_value]`,
representing the decoder self (multi-head) attention intermediate results,
and 0 is the initial length which would increase as inference decoding
continued. Used in inference.
Parameters:
enc_output (Variable): The output of Transformer encoder. It is a tensor
with shape `[batch_size, source_length, d_model]`. The data type
should be float32 or float64. Actually, it is used to provide batch
size for Transformer initial states(caches), thus any tensor has
wanted batch size can be used here.
Returns:
list: A list of dict. Each dict has `k`, `v` as keys, and values are \
empty tensors representing intermediate results of history decoding \
steps in decoder self (multi-head) attention at time step 0.
"""
return [{
"k": layers.fill_constant_batch_size_like(
input=enc_output,
shape=[-1, self.n_head, 0, self.d_key],
dtype=enc_output.dtype,
value=0),
"v": layers.fill_constant_batch_size_like(
input=enc_output,
shape=[-1, self.n_head, 0, self.d_value],
dtype=enc_output.dtype,
value=0),
} for i in range(self.n_layer)]
class LinearChainCRF(Layer):
"""
Computes the negtive log-likelihood of tag sequences in a linear chain CRF.
Using terminologies of undirected probabilistic graph model, it calculates
probability using unary potentials (for emission) and binary potentials
(for transition).
This layer creates a learnable parameter shaped `[size + 2, size]` (`size`
is for the number of tags), where:
1. the first row is for starting weights, denoted as $a$ here
2. the second row is for ending weights, denoted as $b$ here.
3. the remaining rows is a matrix for transition weights.
Denote input tensor of unary potentials(emission) as $x$ , then the probability
of a tag sequence $s$ of length $L$ is defined as:
$$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L}
+ \sum_{l=1}^L x_{s_l}
+ \sum_{l=2}^L w_{s_{l-1},s_l})$$
where $Z$ is a normalization value so that the sum of $P(s)$ over
all possible sequences is 1, and $x$ is the emission feature weight
to the linear chain CRF.
This operator implements the Forward-Backward algorithm for the linear chain
CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.
NOTE:
1. The feature function for a CRF is made up of the emission features and the
transition features. The emission feature weights are NOT computed in
this operator. They MUST be computed first before this operator is called.
2. Because this operator performs global normalization over all possible
sequences internally, it expects UNSCALED emission feature weights.
Please do not call this op with the emission feature being output of any
nonlinear activation.
3. The 2nd dimension of input(emission) MUST be equal to the tag number.
Parameters:
size (int): The number of tags.
param_attr (ParamAttr, optional): The attribute of the learnable parameter for
transition. Default: None
dtype (str, optional): Data type, it can be 'float32' or 'float64'.
Default: `float32`
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import LinearChainCRF
# emission: [batch_size, sequence_length, num_tags]
emission = paddle.rand((2, 8, 5))
# label: [batch_size, sequence_length, num_tags]
# dummy label just for example usage
label = paddle.ones((2, 8), dtype='int64')
length = fluid.layers.assign(np.array([6, 8]).astype('int64'))
crf = LinearChainCRF(size=5)
cost = crf(emission, label, length) # [2, 1]
"""
def __init__(self, size, param_attr=None, dtype='float32'):
super(LinearChainCRF, self).__init__()
self._param_attr = param_attr
self._dtype = dtype
self._size = size
self._transition = self.create_parameter(
attr=self._param_attr,
shape=[self._size + 2, self._size],
dtype=self._dtype)
@property
def weight(self):
"""
getter for transition matrix parameter
Returns:
Parameter: The learnable transition parameter shaped `[size + 2, size]` \
(`size` is for the number of tags). The data type should be float32 \
or float64.
"""
return self._transition
@weight.setter
def weight(self, value):
"""
setter for transition matrix parameter
Parameters:
value (Parameter): The learnable transition parameter shaped `[size + 2, size]` \
(`size` is for the number of tags). The data type should be float32 \
or float64.
"""
self._transition = value
def forward(self, input, label, length):
"""
Computes the log-likelihood of tag sequences in a linear chain CRF.
Parameters:
input (Variable): The input of unary potentials(emission). It is a
tensor with shape `[batch_size, sequence_length, num_tags]`.
The data type should be float32 or float64.
label (Variable): The golden sequence tags. It is a tensor
with shape `[batch_size, sequence_length]`. The data type
should be int64.
length (Variable): A tensor with shape `[batch_size]`. It stores real
length of each sequence for correctness.
Returns:
Variable: The negtive log-likelihood of tag sequences. It is a tensor \
with shape `[batch_size, 1]` and has float32 or float64 data type.
"""
alpha = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
emission_exps = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
transition_exps = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
log_likelihood = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
this_inputs = {
"Emission": [input],
"Transition": self._transition,
"Label": [label]
}
if length is not None:
this_inputs['Length'] = [length]
self._helper.append_op(
type='linear_chain_crf',
inputs=this_inputs,
outputs={
"Alpha": [alpha],
"EmissionExps": [emission_exps],
"TransitionExps": transition_exps,
"LogLikelihood": log_likelihood
})
return log_likelihood
class CRFDecoding(Layer):
"""
CRFDecoding reads the emission feature weights and the transition
feature weights learned by the `LinearChainCRF` and performs decoding.
It implements the Viterbi algorithm which is a dynamic programming algorithm
for finding the most likely sequence of hidden states, called the Viterbi path,
that results in a sequence of observed tags.
The output of this layer changes according to whether `label` is given:
1. `label` is given:
This happens in training. This operator is used to co-work with the chunk_eval
operator. When `label` is given, it returns tensor with the same shape as
`label` whose values are fixed to be 0, indicating an incorrect prediction,
or 1 indicating a tag is correctly predicted. Such an output is the input to
chunk_eval operator.
2. `label` is not given:
This is the standard decoding process and get the highest scoring sequence
of tags.
Parameters:
size (int): The number of tags.
param_attr (ParamAttr, optional): The attribute of the learnable parameter for
transition. Default: None
dtype (str, optional): Data type, it can be 'float32' or 'float64'.
Default: `float32`
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import CRFDecoding
# emission: [batch_size, sequence_length, num_tags]
emission = paddle.rand((2, 8, 5))
length = fluid.layers.assign(np.array([6, 8]).astype('int64'))
crf_decoding = CRFDecoding(size=5)
cost = crf_decoding(emission, length) # [2, 8]
"""
def __init__(self, size, param_attr=None, dtype='float32'):
super(CRFDecoding, self).__init__()
self._dtype = dtype
self._size = size
self._param_attr = param_attr
self._transition = self.create_parameter(
attr=self._param_attr,
shape=[self._size + 2, self._size],
dtype=self._dtype)
@property
def weight(self):
"""
getter for transition matrix parameter
Returns:
Parameter: The learnable transition parameter shaped `[size + 2, size]` \
(`size` is for the number of tags). The data type should be float32 \
or float64.
"""
return self._transition
@weight.setter
def weight(self, value):
"""
setter for transition matrix parameter
Parameters:
value (Parameter): The learnable transition parameter shaped `[size + 2, size]` \
(`size` is for the number of tags). The data type should be float32 \
or float64.
"""
self._transition = value
def forward(self, input, length, label=None):
"""
Performs sequence tagging prediction.
Parameters:
input (Variable): The input of unary potentials(emission). It is a
tensor with shape `[batch_size, sequence_length, num_tags]`.
The data type should be float32 or float64.
length (Variable): A tensor with shape `[batch_size]`.
It stores real length of each sequence for correctness.
label (Variable, optional): The golden sequence tags. It is a tensor
with shape `[batch_size, sequence_length]`. The data type
should be int64. Default None.
Returns:
Variable: A tensor with shape `[batch_size, sequence_length]` and \
int64 data type. If `label` is None, the tensor has binary values \
indicating a correct or incorrect prediction. Otherwise its values \
range from 0 to maximum tag number - 1, each element indicates \
an index of a predicted tag.
"""
viterbi_path = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
this_inputs = {
"Emission": [input],
"Transition": self._transition,
"Label": label
}
if length is not None:
this_inputs['Length'] = [length]
self._helper.append_op(
type='crf_decoding',
inputs=this_inputs,
outputs={"ViterbiPath": [viterbi_path]})
return viterbi_path
class _GRUEncoder(Layer):
"""
A multi-layer bidirectional GRU encoder used by SequenceTagging.
"""
def __init__(self,
input_dim,
grnn_hidden_dim,
init_bound,
num_layers=1,
is_bidirection=False):
super(_GRUEncoder, self).__init__()
self.num_layers = num_layers
self.is_bidirection = is_bidirection
self.gru_list = []
self.gru_r_list = []
for i in range(num_layers):
self.basic_gru_cell = BasicGRUCell(
input_size=input_dim if i == 0 else input_dim * 2,
hidden_size=grnn_hidden_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.gru_list.append(
self.add_sublayer(
"gru_%d" % i,
RNN(self.basic_gru_cell, is_reverse=False,
time_major=False)))
if self.is_bidirection:
for i in range(num_layers):
self.basic_gru_cell_r = BasicGRUCell(
input_size=input_dim if i == 0 else input_dim * 2,
hidden_size=grnn_hidden_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.gru_r_list.append(
self.add_sublayer(
"gru_r_%d" % i,
RNN(self.basic_gru_cell_r,
is_reverse=True,
time_major=False)))
def forward(self, input_feature, h0=None):
for i in range(self.num_layers):
pre_gru, pre_state = self.gru_list[i](input_feature)
if self.is_bidirection:
gru_r, r_state = self.gru_r_list[i](input_feature)
out = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1)
else:
out = pre_gru
input_feature = out
return out
class SequenceTagging(Layer):
"""
Sequence tagging model using multi-layer bidirectional GRU as backbone and
linear chain CRF as output layer.
Parameters:
vocab_size (int): The size of vocabulary.
num_labels (int): The number of labels.
word_emb_dim (int, optional): The embedding size. Defalut 128
grnn_hidden_dim (int, optional): The hidden size of GRU. Defalut 128
emb_learning_rate (int, optional): The partial learning rate for embedding.
The actual learning rate for embedding would multiply it with the global
learning rate. Default 0.1
crf_learning_rate (int, optional): The partial learning rate for crf. The
actual learning rate for embedding would multiply it with the global
learning rate. Default 0.1
bigru_num (int, optional): The number of bidirectional GRU layers.
Default 2
init_bound (float, optional): The range for uniform initializer would
be `(-init_bound, init_bound)`. It would be used for all parameters
except CRF transition matrix. Default 0.1
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.incubate.hapi.text import SequenceTagging
# word: [batch_size, sequence_length]
# dummy input just for example
word = paddle.ones((2, 8), dtype='int64')
length = fluid.layers.assign(np.array([6, 8]).astype('int64'))
seq_tagger = SequenceTagging(vocab_size=100, num_labels=5)
outputs = seq_tagger(word, length)
"""
def __init__(self,
vocab_size,
num_labels,
word_emb_dim=128,
grnn_hidden_dim=128,
emb_learning_rate=0.1,
crf_learning_rate=0.1,
bigru_num=2,
init_bound=0.1):
super(SequenceTagging, self).__init__()
self.word_emb_dim = word_emb_dim
self.vocab_size = vocab_size
self.num_labels = num_labels
self.grnn_hidden_dim = grnn_hidden_dim
self.emb_lr = emb_learning_rate
self.crf_lr = crf_learning_rate
self.bigru_num = bigru_num
self.init_bound = 0.1
self.word_embedding = Embedding(
size=[self.vocab_size, self.word_emb_dim],
dtype='float32',
param_attr=fluid.ParamAttr(
learning_rate=self.emb_lr,
name="word_emb",
initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound)))
self.gru_encoder = _GRUEncoder(
input_dim=self.grnn_hidden_dim,
grnn_hidden_dim=self.grnn_hidden_dim,
init_bound=self.init_bound,
num_layers=self.bigru_num,
is_bidirection=True)
self.fc = Linear(
input_dim=self.grnn_hidden_dim * 2,
output_dim=self.num_labels,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.linear_chain_crf = LinearChainCRF(
param_attr=fluid.ParamAttr(
name='linear_chain_crfw', learning_rate=self.crf_lr),
size=self.num_labels)
self.crf_decoding = CRFDecoding(
param_attr=fluid.ParamAttr(
name='crfw', learning_rate=self.crf_lr),
size=self.num_labels)
def forward(self, word, lengths, target=None):
"""
Performs sequence tagging. If `target` is None, it is for training and
loss would be returned, otherwise it is for inference and returns the
predicted tags.
Parameters:
word (Variable): The input sequences to be labeled. It is a tensor
with shape `[batch_size, sequence_length]`. The data type should
be int64.
lengths (Variable): A tensor with shape `[batch_size]`. It stores real
length of each sequence.
target (Variable, optional): The golden sequence tags. It is a tensor
with shape `[batch_size, sequence_length]`. The data type
should be int64. It could be None for inference. Default None.
Returns:
tuple: A tuple( :code:`(crf_decode, avg_cost, lengths)` ) If input \
argument `target` is provided, including the most likely sequence \
tags, the averaged CRF cost and the sequence lengths, the shapes \
are `[batch_size, sequence_length]`, `[1]` and `[batch_size]`, \
and the data types are int64, float32 and int64. Otherwise A \
tuple( :code:`(crf_decode, lengths)` ) for inference.
"""
word_embed = self.word_embedding(word)
input_feature = word_embed
bigru_output = self.gru_encoder(input_feature)
emission = self.fc(bigru_output)
if target is not None:
crf_cost = self.linear_chain_crf(
input=emission, label=target, length=lengths)
avg_cost = fluid.layers.mean(x=crf_cost)
self.crf_decoding.weight = self.linear_chain_crf.weight
crf_decode = self.crf_decoding(input=emission, length=lengths)
return crf_decode, avg_cost, lengths
else:
self.linear_chain_crf.weight = self.crf_decoding.weight
crf_decode = self.crf_decoding(input=emission, length=lengths)
return crf_decode, lengths
......@@ -183,6 +183,7 @@ packages=['paddle',
'paddle.incubate.hapi.vision',
'paddle.incubate.hapi.vision.models',
'paddle.incubate.hapi.vision.transforms',
'paddle.incubate.hapi.text',
'paddle.io',
'paddle.nn',
'paddle.nn.functional',
......
......@@ -334,6 +334,34 @@
"ParallelEnv",
"DataParallel",
"DataParallel.scale_loss",
"DataParallel.apply_collective_grads"
"DataParallel.apply_collective_grads",
"BasicLSTMCell.forward",
"BasicGRUCell.forward",
"RNN.forward",
"StackedRNNCell.forward",
"StackedLSTMCell.forward",
"LSTM.forward",
"BidirectionalRNN.forward",
"BidirectionalLSTM.forward",
"StackedGRUCell.forward",
"GRU.forward",
"BidirectionalGRU.forward",
"DynamicDecode.forward",
"Conv1dPoolLayer.forward",
"CNNEncoder.forward",
"TransformerCell.forward",
"TransformerBeamSearchDecoder.step",
"MultiHeadAttention.forward",
"MultiHeadAttention.cal_kv",
"FFN.forward",
"TransformerEncoderLayer.forward",
"TransformerEncoder.forward",
"TransformerDecoderLayer.forward",
"TransformerDecoder.forward",
"TransformerDecoder.prepare_static_cache",
"TransformerDecoder.prepare_incremental_cache",
"LinearChainCRF.forward",
"CRFDecoding.forward",
"SequenceTagging.forward"
]
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册