diff --git a/examples/sentiment_classification/models.py b/examples/sentiment_classification/models.py
index 313b928778f64001c5b37888bc546d6ff33bd970..1816ba4ea55abadc680cecf0e2f81b251a68fba8 100644
--- a/examples/sentiment_classification/models.py
+++ b/examples/sentiment_classification/models.py
@@ -16,12 +16,12 @@ from paddle.fluid.dygraph.nn import Linear, Embedding
 from paddle.fluid.dygraph.base import to_variable
 import numpy as np
 from hapi.model import Model
-from hapi.text.text import GRUEncoderLayer as BiGRUEncoder
+from hapi.text.text import GRUEncoder as BiGRUEncoder
 from hapi.text.test import BOWEncoder, CNNEncoder, GRUEncoder
 
 
 class CNN(Model):
-    def __init__(self, dict_dim, batch_size, seq_len):
+    def __init__(self, dict_dim, batch_size, seq_len):
         super(CNN, self).__init__()
         self.dict_dim = dict_dim
         self.emb_dim = 128
@@ -36,15 +36,19 @@ class CNN(Model):
             dict_size=self.dict_dim + 1,
             emb_dim=self.emb_dim,
             seq_len=self.seq_len,
-            filter_size= self.win_size,
-            num_filters= self.hid_dim,
-            hidden_dim= self.hid_dim,
+            filter_size=self.win_size,
+            num_filters=self.hid_dim,
+            hidden_dim=self.hid_dim,
             padding_idx=None,
             act='tanh')
-        self._fc1 = Linear(input_dim = self.hid_dim*self.seq_len, output_dim=self.fc_hid_dim, act="softmax")
-        self._fc_prediction = Linear(input_dim = self.fc_hid_dim,
-                               output_dim = self.class_dim,
-                               act="softmax")
+        self._fc1 = Linear(
+            input_dim=self.hid_dim * self.seq_len,
+            output_dim=self.fc_hid_dim,
+            act="softmax")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim,
+            output_dim=self.class_dim,
+            act="softmax")
 
     def forward(self, inputs):
         conv_3 = self._encoder(inputs)
@@ -69,11 +73,14 @@ class BOW(Model):
             padding_idx=None,
             bow_dim=self.hid_dim,
             seq_len=self.seq_len)
-        self._fc1 = Linear(input_dim = self.hid_dim, output_dim=self.hid_dim, act="tanh")
-        self._fc2 = Linear(input_dim = self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
-        self._fc_prediction = Linear(input_dim = self.fc_hid_dim,
-                               output_dim = self.class_dim,
-                               act="softmax")
+        self._fc1 = Linear(
+            input_dim=self.hid_dim, output_dim=self.hid_dim, act="tanh")
+        self._fc2 = Linear(
+            input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim,
+            output_dim=self.class_dim,
+            act="softmax")
 
     def forward(self, inputs):
         bow_1 = self._encoder(inputs)
@@ -94,10 +101,12 @@ class GRU(Model):
         self.class_dim = 2
         self.batch_size = batch_size
         self.seq_len = seq_len
-        self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
-        self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
-                                     output_dim=self.class_dim,
-                                     act="softmax")
+        self._fc1 = Linear(
+            input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim,
+            output_dim=self.class_dim,
+            act="softmax")
         self._encoder = GRUEncoder(
             dict_size=self.dict_dim + 1,
             emb_dim=self.emb_dim,
@@ -112,7 +121,7 @@ class GRU(Model):
         prediction = self._fc_prediction(fc_1)
         return prediction
 
-    
+
 class BiGRU(Model):
     def __init__(self, dict_dim, batch_size, seq_len):
         super(BiGRU, self).__init__()
@@ -130,11 +139,13 @@ class BiGRU(Model):
             is_sparse=False)
         h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32")
         h_0 = to_variable(h_0)
-        self._fc1 = Linear(input_dim = self.hid_dim, output_dim=self.hid_dim*3)
-        self._fc2 = Linear(input_dim = self.hid_dim*2, output_dim=self.fc_hid_dim, act="tanh")
-        self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
-                                     output_dim=self.class_dim,
-                                     act="softmax")
+        self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3)
+        self._fc2 = Linear(
+            input_dim=self.hid_dim * 2, output_dim=self.fc_hid_dim, act="tanh")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim,
+            output_dim=self.class_dim,
+            act="softmax")
         self._encoder = BiGRUEncoder(
             grnn_hidden_dim=self.hid_dim,
             input_dim=self.hid_dim * 3,
@@ -144,7 +155,8 @@ class BiGRU(Model):
 
     def forward(self, inputs):
         emb = self.embedding(inputs)
-        emb = fluid.layers.reshape(emb, shape=[self.batch_size, -1, self.hid_dim])
+        emb = fluid.layers.reshape(
+            emb, shape=[self.batch_size, -1, self.hid_dim])
         fc_1 = self._fc1(emb)
         encoded_vector = self._encoder(fc_1)
         encoded_vector = fluid.layers.tanh(encoded_vector)
diff --git a/examples/transformer/transformer.py b/examples/transformer/transformer.py
index b2ec120713eddcde0637debf6cbacf0b89aa0f57..179dc17ad7aa21136a081da63a7893e087114337 100644
--- a/examples/transformer/transformer.py
+++ b/examples/transformer/transformer.py
@@ -21,7 +21,7 @@ import paddle.fluid.layers as layers
 from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer
 from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
 from hapi.model import Model, CrossEntropy, Loss
-from hapi.text import TransformerCell, TransformerBeamSearchDecoder, DynamicDecode
+from hapi.text import TransformerBeamSearchDecoder, DynamicDecode
 
 
 def position_encoding_init(n_position, d_pos_vec):
@@ -606,6 +606,27 @@ class Transformer(Model):
         return predict
 
 
+class TransformerCell(Layer):
+    """
+    Let inputs=(trg_word, trg_pos), states=cache to make Transformer can be
+    used as RNNCell
+    """
+
+    def __init__(self, decoder):
+        super(TransformerCell, self).__init__()
+        self.decoder = decoder
+
+    def forward(self, inputs, states, trg_src_attn_bias, enc_output,
+                static_caches):
+        trg_word, trg_pos = inputs
+        for cache, static_cache in zip(states, static_caches):
+            cache.update(static_cache)
+        logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
+                              enc_output, states)
+        new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states]
+        return logits, new_states
+
+
 class InferTransformer(Transformer):
     """
     model for prediction
diff --git a/hapi/tests/test_text.py b/hapi/tests/test_text.py
index 46efbf648ac817de85a5e464344f8a8f32e76313..eca5fda0a4927641a1de7aa114dd6cf7e47ab304 100644
--- a/hapi/tests/test_text.py
+++ b/hapi/tests/test_text.py
@@ -25,8 +25,8 @@ from paddle.fluid.dygraph import Embedding, Linear, Layer
 from paddle.fluid.layers import BeamSearchDecoder
 import hapi.text as text
 from hapi.model import Model, Input, set_device
-from hapi.text import BasicLSTMCell, BasicGRUCell, RNN, DynamicDecode, MultiHeadAttention, TransformerEncoder
-from hapi.text import *
+# from hapi.text.text import BasicLSTMCell, BasicGRUCell, RNN, DynamicDecode, MultiHeadAttention, TransformerEncoder, TransformerCell
+from hapi.text.text import *
 
 
 def sigmoid(x):
@@ -187,7 +187,7 @@ class TestBasicLSTM(ModuleApiTest):
             Input(
                 [None, None, self.inputs[-1].shape[-1]],
                 "float32",
-                name="input")
+                name="input"),
         ]
         return inputs
 
@@ -216,7 +216,7 @@ class TestBasicGRU(ModuleApiTest):
             Input(
                 [None, None, self.inputs[-1].shape[-1]],
                 "float32",
-                name="input")
+                name="input"),
         ]
         return inputs
 
@@ -270,10 +270,9 @@ class TestBeamSearch(ModuleApiTest):
             Input(
                 [None, self.inputs[0].shape[-1]],
                 "float32",
-                name="init_hidden"), Input(
-                [None, self.inputs[1].shape[-1]],
-                "float32",
-                name="init_cell")
+                name="init_hidden"),
+            Input(
+                [None, self.inputs[1].shape[-1]], "float32", name="init_cell"),
name="init_cell"), ] return inputs @@ -328,10 +327,11 @@ class TestTransformerEncoder(ModuleApiTest): Input( [None, None, self.inputs[0].shape[-1]], "float32", - name="enc_input"), Input( - [None, self.inputs[1].shape[1], None, None], - "float32", - name="attn_bias") + name="enc_input"), + Input( + [None, self.inputs[1].shape[1], None, None], + "float32", + name="attn_bias"), ] return inputs @@ -395,16 +395,19 @@ class TestTransformerDecoder(TestTransformerEncoder): Input( [None, None, self.inputs[0].shape[-1]], "float32", - name="dec_input"), Input( - [None, None, self.inputs[0].shape[-1]], - "float32", - name="enc_output"), Input( - [None, self.inputs[-1].shape[1], None, None], - "float32", - name="self_attn_bias"), Input( - [None, self.inputs[-1].shape[1], None, None], - "float32", - name="cross_attn_bias") + name="dec_input"), + Input( + [None, None, self.inputs[0].shape[-1]], + "float32", + name="enc_output"), + Input( + [None, self.inputs[-1].shape[1], None, None], + "float32", + name="self_attn_bias"), + Input( + [None, self.inputs[-1].shape[1], None, None], + "float32", + name="cross_attn_bias"), ] return inputs @@ -414,16 +417,21 @@ class TestTransformerDecoder(TestTransformerEncoder): class TestTransformerBeamSearchDecoder(ModuleApiTest): def setUp(self): - shape = (8, 32) self.inputs = [ - np.random.random(shape).astype("float32"), - np.random.random(shape).astype("float32") + # encoder output: [batch_size, seq_len, hidden_size] + np.random.random([2, 5, 128]).astype("float32"), + # cross attention bias: [batch_size, n_head, seq_len, seq_len] + np.random.randint(0, 1, [2, 2, 1, 5]).astype("float32") * -1e9 ] self.outputs = None self.attrs = { "vocab_size": 100, - "embed_dim": 32, - "hidden_size": 32, + "n_layer": 2, + "n_head": 2, + "d_key": 64, + "d_value": 64, + "d_model": 128, + "d_inner_hid": 128 } self.param_states = {} @@ -445,13 +453,24 @@ class TestTransformerBeamSearchDecoder(ModuleApiTest): eos_id=1, beam_size=4, max_step_num=20): - embedder = Embedding(size=[vocab_size, d_model]) + self.beam_size = beam_size + + def embeder_init(self, size): + Layer.__init__(self) + self.embedder = Embedding(size) + + Embedder = type("Embedder", (Layer, ), { + "__init__": embeder_init, + "forward": lambda self, word, pos: self.embedder(word) + }) + embedder = Embedder(size=[vocab_size, d_model]) output_layer = Linear(d_model, vocab_size) - decoder = TransformerDecoder(n_layer, n_head, d_key, d_value, d_model, - d_inner_hid, prepostprocess_dropout, - attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd) - transformer_cell = TransformerCell(decoder) + self.decoder = TransformerDecoder( + n_layer, n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, relu_dropout, + preprocess_cmd, postprocess_cmd) + transformer_cell = TransformerCell(self.decoder, embedder, + output_layer) self.beam_search_decoder = DynamicDecode( TransformerBeamSearchDecoder( transformer_cell, @@ -464,23 +483,12 @@ class TestTransformerBeamSearchDecoder(ModuleApiTest): @staticmethod def model_forward(self, enc_output, trg_src_attn_bias): - caches = [{ - "k": layers.fill_constant_batch_size_like( - input=enc_output, - shape=[-1, self.n_head, 0, self.d_key], - dtype=enc_output.dtype, - value=0), - "v": layers.fill_constant_batch_size_like( - input=enc_output, - shape=[-1, self.n_head, 0, self.d_value], - dtype=enc_output.dtype, - value=0), - } for i in range(self.n_layer)] + caches = self.decoder.prepare_incremental_cache(enc_output) enc_output = 
         enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
             enc_output, self.beam_size)
         trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
             trg_src_attn_bias, self.beam_size)
-        static_caches = self.decoder.decoder.prepare_static_cache(enc_output)
+        static_caches = self.decoder.prepare_static_cache(enc_output)
         rs, _ = self.beam_search_decoder(
             inits=caches,
             enc_output=enc_output,
@@ -491,12 +499,42 @@
     def make_inputs(self):
         inputs = [
             Input(
-                [None, self.inputs[0].shape[-1]],
+                [None, None, self.inputs[0].shape[-1]],
+                "float32",
+                name="enc_output"),
+            Input(
+                [None, self.inputs[1].shape[1], None, None],
+                "float32",
+                name="trg_src_attn_bias"),
+        ]
+        return inputs
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestSequenceTagging(ModuleApiTest):
+    def setUp(self):
+        shape = (2, 4, 128)
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None
+        self.attrs = {"input_size": 128, "hidden_size": 128}
+        self.param_states = {}
+
+    @staticmethod
+    def model_init(self, input_size, hidden_size):
+        self.module = SequenceTagging(input_size, hidden_size)
+
+    @staticmethod
+    def model_forward(self, inputs):
+        return self.module(inputs)[0]
+
+    def make_inputs(self):
+        inputs = [
+            Input(
+                [None, None, self.inputs[-1].shape[-1]],
                 "float32",
-                name="init_hidden"), Input(
-                [None, self.inputs[1].shape[-1]],
-                "float32",
-                name="init_cell")
+                name="input"),
         ]
         return inputs
 
diff --git a/hapi/text/__init__.py b/hapi/text/__init__.py
index 2177ada5c0c7135e3feea0772b609d0ab29a7ba2..890e9891b9bcabea3d0f55f97c5123b1146a2622 100644
--- a/hapi/text/__init__.py
+++ b/hapi/text/__init__.py
@@ -28,6 +28,6 @@ from hapi.text.text import TransformerBeamSearchDecoder as TransformerBeamSearch
 from hapi.text.text import GRUCell as GRUCell
 from hapi.text.text import GRUEncoderCell as GRUEncoderCell
 from hapi.text.text import BiGRU as BiGRU
-from hapi.text.text import Linear_chain_crf as Linear_chain_crf
-from hapi.text.text import Crf_decoding as Crf_decoding
+from hapi.text.text import LinearChainCRF as LinearChainCRF
+from hapi.text.text import CRFDecoding as CRFDecoding
 from hapi.text.text import SequenceTagging as SequenceTagging
diff --git a/hapi/text/text.py b/hapi/text/text.py
index 0a382cd4fcb2523c37589c89a2f2f46bf21a2399..83327000c99e7091f0f7dc9df0e58349aa43b75b 100644
--- a/hapi/text/text.py
+++ b/hapi/text/text.py
@@ -49,7 +49,7 @@ __all__ = [
     'BeamSearchDecoder', 'MultiHeadAttention', 'FFN',
     'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer',
     'TransformerDecoder', 'TransformerCell', 'TransformerBeamSearchDecoder',
-    'Linear_chain_crf', 'Crf_decoding', 'SequenceTagging', 'GRUEncoderLayer'
+    'LinearChainCRF', 'CRFDecoding', 'SequenceTagging', 'GRUEncoder'
 ]
 
 
@@ -1008,18 +1008,38 @@ class TransformerCell(Layer):
     used as RNNCell
     """
 
-    def __init__(self, decoder):
+    def __init__(self, decoder, embedding_fn=None, output_fn=None):
+        super(TransformerCell, self).__init__()
         self.decoder = decoder
+        self.embedding_fn = embedding_fn
+        self.output_fn = output_fn
 
-    def __call__(self, inputs, states, trg_src_attn_bias, enc_output,
-                 static_caches):
+    def forward(self, inputs, states, trg_src_attn_bias, enc_output,
+                static_caches):
         trg_word, trg_pos = inputs
         for cache, static_cache in zip(states, static_caches):
             cache.update(static_cache)
-        logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
-                              enc_output, states)
+        if self.embedding_fn is not None:
+            dec_input = self.embedding_fn(trg_word, trg_pos)
+            outputs = self.decoder(dec_input, enc_output, None,
+                                   trg_src_attn_bias, states)
+        else:
+            outputs = self.decoder(trg_word, trg_pos, enc_output, None,
+                                   trg_src_attn_bias, states)
+        if self.output_fn is not None:
+            outputs = self.output_fn(outputs)
+        if len(outputs.shape) == 3:
+            # squeeze to adapt to BeamSearchDecoder which uses 2D logits
+            outputs = layers.squeeze(outputs, [1])
         new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states]
-        return logits, new_states
+        return outputs, new_states
+
+    @property
+    def state_shape(self):
+        return [{
+            "k": [self.decoder.n_head, 0, self.decoder.d_key],
+            "v": [self.decoder.n_head, 0, self.decoder.d_value],
+        } for i in range(self.decoder.n_layer)]
 
 
 class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):
@@ -1521,6 +1541,11 @@ class TransformerDecoder(Layer):
                  preprocess_cmd, postprocess_cmd):
         super(TransformerDecoder, self).__init__()
 
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.d_key = d_key
+        self.d_value = d_value
+
         self.decoder_layers = list()
         for i in range(n_layer):
             self.decoder_layers.append(
@@ -1555,6 +1580,20 @@
             for decoder_layer in self.decoder_layers
         ]
 
+    def prepare_incremental_cache(self, enc_output):
+        return [{
+            "k": layers.fill_constant_batch_size_like(
+                input=enc_output,
+                shape=[-1, self.n_head, 0, self.d_key],
+                dtype=enc_output.dtype,
+                value=0),
+            "v": layers.fill_constant_batch_size_like(
+                input=enc_output,
+                shape=[-1, self.n_head, 0, self.d_value],
+                dtype=enc_output.dtype,
+                value=0),
+        } for i in range(self.n_layer)]
+
 
 #TODO: we should merge GRUCell with BasicGRUCell
 class GRUCell(RNNCell):
@@ -1651,9 +1690,9 @@ class BiGRU(fluid.dygraph.Layer):
         return bi_merge
 
 
-class Linear_chain_crf(fluid.dygraph.Layer):
+class LinearChainCRF(Layer):
     def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
-        super(Linear_chain_crf, self).__init__()
+        super(LinearChainCRF, self).__init__()
 
         self._param_attr = param_attr
         self._dtype = dtype
@@ -1702,9 +1741,9 @@
         return log_likelihood
 
 
-class Crf_decoding(fluid.dygraph.Layer):
+class CRFDecoding(Layer):
     def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
-        super(Crf_decoding, self).__init__()
+        super(CRFDecoding, self).__init__()
 
         self._dtype = dtype
         self._size = size
@@ -1742,7 +1781,7 @@
         return viterbi_path
 
 
-class GRUEncoderLayer(Layer):
+class GRUEncoder(Layer):
     def __init__(self,
                  input_dim,
                  grnn_hidden_dim,
@@ -1750,7 +1789,7 @@
                  num_layers=1,
                  h_0=None,
                  is_bidirection=False):
-        super(GRUEncoderLayer, self).__init__()
+        super(GRUEncoder, self).__init__()
         self.h_0 = h_0
         self.num_layers = num_layers
         self.is_bidirection = is_bidirection
@@ -1849,7 +1888,7 @@
                 force_cpu=True,
                 name='h_0')
 
-        self.gru_encoder = GRUEncoderLayer(
+        self.gru_encoder = GRUEncoder(
             input_dim=self.grnn_hidden_dim,
             grnn_hidden_dim=self.grnn_hidden_dim,
             init_bound=self.init_bound,
@@ -1866,12 +1905,12 @@
                     regularizer=fluid.regularizer.L2DecayRegularizer(
                         regularization_coeff=1e-4)))
 
-        self.linear_chain_crf = Linear_chain_crf(
+        self.linear_chain_crf = LinearChainCRF(
             param_attr=fluid.ParamAttr(
                 name='linear_chain_crfw', learning_rate=self.crf_lr),
             size=self.num_labels)
 
-        self.crf_decoding = Crf_decoding(
+        self.crf_decoding = CRFDecoding(
             param_attr=fluid.ParamAttr(
                 name='crfw', learning_rate=self.crf_lr),
             size=self.num_labels)
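
For orientation, the refactored decoding API exercised by TestTransformerBeamSearchDecoder can be wired up roughly as below. This is a minimal sketch, not part of the patch: the hyperparameters, the Embedder wrapper, and the cache handling mirror the test code in the diff, while anything the diff does not show, in particular the "n"/"da" pre/post-process commands and how bos_id, eos_id, beam_size, max_step_num and var_dim_in_state are forwarded to TransformerBeamSearchDecoder and DynamicDecode, is an assumption about the hapi API.

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, Layer, Linear, to_variable
from hapi.text.text import (TransformerDecoder, TransformerCell,
                            TransformerBeamSearchDecoder, DynamicDecode)


class Embedder(Layer):
    # Word embedding only; position encoding is left out of this sketch.
    def __init__(self, vocab_size, d_model):
        super(Embedder, self).__init__()
        self.word_embedder = Embedding(size=[vocab_size, d_model])

    def forward(self, word, pos):
        return self.word_embedder(word)


with fluid.dygraph.guard():
    vocab_size, d_model, n_head, beam_size = 100, 128, 2, 4
    # Constructor arguments follow the order used in model_init above;
    # the "n"/"da" pre/post-process commands are assumed defaults.
    decoder = TransformerDecoder(2, n_head, 64, 64, d_model, 128,
                                 0.1, 0.1, 0.1, "n", "da")
    cell = TransformerCell(decoder, Embedder(vocab_size, d_model),
                           Linear(d_model, vocab_size))
    # bos_id/eos_id/beam_size/max_step_num appear in model_init; the exact
    # signatures of the two decoding helpers below are assumed.
    beam_search = DynamicDecode(
        TransformerBeamSearchDecoder(
            cell, 0, 1, beam_size, var_dim_in_state=2),
        max_step_num=20,
        is_test=True)

    enc_output = to_variable(
        np.random.random([2, 5, d_model]).astype("float32"))
    trg_src_attn_bias = to_variable(
        np.zeros([2, n_head, 1, 5], dtype="float32"))

    # Same cache handling as model_forward above: empty incremental (k, v)
    # caches per layer, plus static caches computed from the encoder output.
    caches = decoder.prepare_incremental_cache(enc_output)
    enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
        enc_output, beam_size)
    trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
        trg_src_attn_bias, beam_size)
    static_caches = decoder.prepare_static_cache(enc_output)
    ids, _ = beam_search(
        inits=caches,
        enc_output=enc_output,
        trg_src_attn_bias=trg_src_attn_bias,
        static_caches=static_caches)

Compared with the pre-patch code, the caller no longer builds the per-layer fill_constant_batch_size_like caches by hand; prepare_incremental_cache and prepare_static_cache on TransformerDecoder take care of that.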