From eb20b652658a256c5bb6f026bebc2f66b44c11bf Mon Sep 17 00:00:00 2001
From: guosheng
Date: Thu, 30 Apr 2020 18:15:00 +0800
Subject: [PATCH] Add more unit tests for apis in text.py. Rename some apis in
 text.py.

---
 examples/sentiment_classification/models.py |  62 +++++----
 examples/transformer/transformer.py         |  23 +++-
 hapi/tests/test_text.py                     | 140 +++++++++++++-------
 hapi/text/__init__.py                       |   4 +-
 hapi/text/text.py                           |  71 +++++++---
 5 files changed, 205 insertions(+), 95 deletions(-)

diff --git a/examples/sentiment_classification/models.py b/examples/sentiment_classification/models.py
index 313b928..1816ba4 100644
--- a/examples/sentiment_classification/models.py
+++ b/examples/sentiment_classification/models.py
@@ -16,12 +16,12 @@ from paddle.fluid.dygraph.nn import Linear, Embedding
 from paddle.fluid.dygraph.base import to_variable
 import numpy as np
 from hapi.model import Model
-from hapi.text.text import GRUEncoderLayer as BiGRUEncoder
+from hapi.text.text import GRUEncoder as BiGRUEncoder
 from hapi.text.test import BOWEncoder, CNNEncoder, GRUEncoder
 
 
 class CNN(Model):
-    def __init__(self, dict_dim, batch_size, seq_len):
+    def __init__(self, dict_dim, batch_size, seq_len):
         super(CNN, self).__init__()
         self.dict_dim = dict_dim
         self.emb_dim = 128
@@ -36,15 +36,19 @@ class CNN(Model):
             dict_size=self.dict_dim + 1,
             emb_dim=self.emb_dim,
             seq_len=self.seq_len,
-            filter_size= self.win_size,
-            num_filters= self.hid_dim,
-            hidden_dim= self.hid_dim,
+            filter_size=self.win_size,
+            num_filters=self.hid_dim,
+            hidden_dim=self.hid_dim,
             padding_idx=None,
             act='tanh')
-        self._fc1 = Linear(input_dim = self.hid_dim*self.seq_len, output_dim=self.fc_hid_dim, act="softmax")
-        self._fc_prediction = Linear(input_dim = self.fc_hid_dim,
-                               output_dim = self.class_dim,
-                               act="softmax")
+        self._fc1 = Linear(
+            input_dim=self.hid_dim * self.seq_len,
+            output_dim=self.fc_hid_dim,
+            act="softmax")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim,
+            output_dim=self.class_dim,
+            act="softmax")
 
     def forward(self, inputs):
         conv_3 = self._encoder(inputs)
@@ -69,11 +73,14 @@ class BOW(Model):
             padding_idx=None,
             bow_dim=self.hid_dim,
             seq_len=self.seq_len)
-        self._fc1 = Linear(input_dim = self.hid_dim, output_dim=self.hid_dim, act="tanh")
-        self._fc2 = Linear(input_dim = self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
-        self._fc_prediction = Linear(input_dim = self.fc_hid_dim,
-                               output_dim = self.class_dim,
-                               act="softmax")
+        self._fc1 = Linear(
+            input_dim=self.hid_dim, output_dim=self.hid_dim, act="tanh")
+        self._fc2 = Linear(
+            input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim,
+            output_dim=self.class_dim,
+            act="softmax")
 
     def forward(self, inputs):
         bow_1 = self._encoder(inputs)
@@ -94,10 +101,12 @@ class GRU(Model):
         self.class_dim = 2
         self.batch_size = batch_size
         self.seq_len = seq_len
-        self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
-        self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
-                                     output_dim=self.class_dim,
-                                     act="softmax")
+        self._fc1 = Linear(
+            input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim,
+            output_dim=self.class_dim,
+            act="softmax")
         self._encoder = GRUEncoder(
             dict_size=self.dict_dim + 1,
             emb_dim=self.emb_dim,
@@ -112,7 +121,7 @@ class GRU(Model):
         prediction = self._fc_prediction(fc_1)
         return prediction
-    
+
 
 class BiGRU(Model):
     def __init__(self, dict_dim, batch_size, seq_len):
         super(BiGRU, self).__init__()
@@ -130,11 +139,13 @@ class BiGRU(Model):
         is_sparse=False)
         h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32")
         h_0 = to_variable(h_0)
-        self._fc1 = Linear(input_dim = self.hid_dim, output_dim=self.hid_dim*3)
-        self._fc2 = Linear(input_dim = self.hid_dim*2, output_dim=self.fc_hid_dim, act="tanh")
-        self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
-                                     output_dim=self.class_dim,
-                                     act="softmax")
+        self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3)
+        self._fc2 = Linear(
+            input_dim=self.hid_dim * 2, output_dim=self.fc_hid_dim, act="tanh")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim,
+            output_dim=self.class_dim,
+            act="softmax")
         self._encoder = BiGRUEncoder(
             grnn_hidden_dim=self.hid_dim,
             input_dim=self.hid_dim * 3,
@@ -144,7 +155,8 @@ class BiGRU(Model):
 
     def forward(self, inputs):
         emb = self.embedding(inputs)
-        emb = fluid.layers.reshape(emb, shape=[self.batch_size, -1, self.hid_dim])
+        emb = fluid.layers.reshape(
+            emb, shape=[self.batch_size, -1, self.hid_dim])
         fc_1 = self._fc1(emb)
         encoded_vector = self._encoder(fc_1)
         encoded_vector = fluid.layers.tanh(encoded_vector)
diff --git a/examples/transformer/transformer.py b/examples/transformer/transformer.py
index b2ec120..179dc17 100644
--- a/examples/transformer/transformer.py
+++ b/examples/transformer/transformer.py
@@ -21,7 +21,7 @@ import paddle.fluid.layers as layers
 from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer
 from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
 from hapi.model import Model, CrossEntropy, Loss
-from hapi.text import TransformerCell, TransformerBeamSearchDecoder, DynamicDecode
+from hapi.text import TransformerBeamSearchDecoder, DynamicDecode
 
 
 def position_encoding_init(n_position, d_pos_vec):
@@ -606,6 +606,27 @@ class Transformer(Model):
         return predict
 
 
+class TransformerCell(Layer):
+    """
+    Let inputs=(trg_word, trg_pos), states=cache so that the Transformer can
+    be used as an RNNCell.
+    """
+
+    def __init__(self, decoder):
+        super(TransformerCell, self).__init__()
+        self.decoder = decoder
+
+    def forward(self, inputs, states, trg_src_attn_bias, enc_output,
+                static_caches):
+        trg_word, trg_pos = inputs
+        for cache, static_cache in zip(states, static_caches):
+            cache.update(static_cache)
+        logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
+                              enc_output, states)
+        new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states]
+        return logits, new_states
+
+
 class InferTransformer(Transformer):
     """
     model for prediction
diff --git a/hapi/tests/test_text.py b/hapi/tests/test_text.py
index 46efbf6..eca5fda 100644
--- a/hapi/tests/test_text.py
+++ b/hapi/tests/test_text.py
@@ -25,8 +25,8 @@ from paddle.fluid.dygraph import Embedding, Linear, Layer
 from paddle.fluid.layers import BeamSearchDecoder
 import hapi.text as text
 from hapi.model import Model, Input, set_device
-from hapi.text import BasicLSTMCell, BasicGRUCell, RNN, DynamicDecode, MultiHeadAttention, TransformerEncoder
-from hapi.text import *
+# from hapi.text.text import BasicLSTMCell, BasicGRUCell, RNN, DynamicDecode, MultiHeadAttention, TransformerEncoder, TransformerCell
+from hapi.text.text import *
 
 
 def sigmoid(x):
@@ -187,7 +187,7 @@ class TestBasicLSTM(ModuleApiTest):
             Input(
                 [None, None, self.inputs[-1].shape[-1]],
                 "float32",
-                name="input")
+                name="input"),
         ]
         return inputs
 
@@ -216,7 +216,7 @@ class TestBasicGRU(ModuleApiTest):
             Input(
                 [None, None, self.inputs[-1].shape[-1]],
                 "float32",
-                name="input")
+                name="input"),
         ]
         return inputs
 
@@ -270,10 +270,9 @@ class TestBeamSearch(ModuleApiTest):
             Input(
                 [None, self.inputs[0].shape[-1]],
                 "float32",
-                name="init_hidden"), Input(
-                    [None, self.inputs[1].shape[-1]],
-                    "float32",
-                    name="init_cell")
+                name="init_hidden"),
+            Input(
+                [None, self.inputs[1].shape[-1]], "float32", name="init_cell"),
         ]
         return inputs
 
@@ -328,10 +327,11 @@ class TestTransformerEncoder(ModuleApiTest):
             Input(
                 [None, None, self.inputs[0].shape[-1]],
                 "float32",
-                name="enc_input"), Input(
-                    [None, self.inputs[1].shape[1], None, None],
-                    "float32",
-                    name="attn_bias")
+                name="enc_input"),
+            Input(
+                [None, self.inputs[1].shape[1], None, None],
+                "float32",
+                name="attn_bias"),
         ]
         return inputs
 
@@ -395,16 +395,19 @@ class TestTransformerDecoder(TestTransformerEncoder):
             Input(
                 [None, None, self.inputs[0].shape[-1]],
                 "float32",
-                name="dec_input"), Input(
-                    [None, None, self.inputs[0].shape[-1]],
-                    "float32",
-                    name="enc_output"), Input(
-                        [None, self.inputs[-1].shape[1], None, None],
-                        "float32",
-                        name="self_attn_bias"), Input(
-                            [None, self.inputs[-1].shape[1], None, None],
-                            "float32",
-                            name="cross_attn_bias")
+                name="dec_input"),
+            Input(
+                [None, None, self.inputs[0].shape[-1]],
+                "float32",
+                name="enc_output"),
+            Input(
+                [None, self.inputs[-1].shape[1], None, None],
+                "float32",
+                name="self_attn_bias"),
+            Input(
+                [None, self.inputs[-1].shape[1], None, None],
+                "float32",
+                name="cross_attn_bias"),
         ]
         return inputs
 
@@ -414,16 +417,21 @@ class TestTransformerDecoder(TestTransformerEncoder):
 
 class TestTransformerBeamSearchDecoder(ModuleApiTest):
     def setUp(self):
-        shape = (8, 32)
         self.inputs = [
-            np.random.random(shape).astype("float32"),
-            np.random.random(shape).astype("float32")
+            # encoder output: [batch_size, seq_len, hidden_size]
+            np.random.random([2, 5, 128]).astype("float32"),
+            # cross attention bias: [batch_size, n_head, seq_len, seq_len]
+            np.random.randint(0, 1, [2, 2, 1, 5]).astype("float32") * -1e9
         ]
         self.outputs = None
         self.attrs = {
             "vocab_size": 100,
-            "embed_dim": 32,
-            "hidden_size": 32,
+            "n_layer": 2,
+            "n_head": 2,
+            "d_key": 64,
+            "d_value": 64,
+            "d_model": 128,
+            "d_inner_hid": 128
         }
         self.param_states = {}
 
@@ -445,13 +453,24 @@ class TestTransformerBeamSearchDecoder(ModuleApiTest):
                    eos_id=1,
                    beam_size=4,
                    max_step_num=20):
-        embedder = Embedding(size=[vocab_size, d_model])
+        self.beam_size = beam_size
+
+        def embedder_init(self, size):
+            Layer.__init__(self)
+            self.embedder = Embedding(size)
+
+        Embedder = type("Embedder", (Layer, ), {
+            "__init__": embedder_init,
+            "forward": lambda self, word, pos: self.embedder(word)
+        })
+        embedder = Embedder(size=[vocab_size, d_model])
         output_layer = Linear(d_model, vocab_size)
-        decoder = TransformerDecoder(n_layer, n_head, d_key, d_value, d_model,
-                                     d_inner_hid, prepostprocess_dropout,
-                                     attention_dropout, relu_dropout,
-                                     preprocess_cmd, postprocess_cmd)
-        transformer_cell = TransformerCell(decoder)
+        self.decoder = TransformerDecoder(
+            n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
+            prepostprocess_dropout, attention_dropout, relu_dropout,
+            preprocess_cmd, postprocess_cmd)
+        transformer_cell = TransformerCell(self.decoder, embedder,
+                                           output_layer)
         self.beam_search_decoder = DynamicDecode(
             TransformerBeamSearchDecoder(
                 transformer_cell,
@@ -464,23 +483,12 @@
 
     @staticmethod
     def model_forward(self, enc_output, trg_src_attn_bias):
-        caches = [{
-            "k": layers.fill_constant_batch_size_like(
-                input=enc_output,
-                shape=[-1, self.n_head, 0, self.d_key],
-                dtype=enc_output.dtype,
-                value=0),
-            "v": layers.fill_constant_batch_size_like(
-                input=enc_output,
-                shape=[-1, self.n_head, 0, self.d_value],
-                dtype=enc_output.dtype,
-                value=0),
-        } for i in range(self.n_layer)]
+        caches = self.decoder.prepare_incremental_cache(enc_output)
         enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
             enc_output, self.beam_size)
         trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
             trg_src_attn_bias, self.beam_size)
-        static_caches = self.decoder.decoder.prepare_static_cache(enc_output)
+        static_caches = self.decoder.prepare_static_cache(enc_output)
         rs, _ = self.beam_search_decoder(
             inits=caches,
             enc_output=enc_output,
@@ -491,12 +499,42 @@
     def make_inputs(self):
         inputs = [
             Input(
-                [None, self.inputs[0].shape[-1]],
+                [None, None, self.inputs[0].shape[-1]],
+                "float32",
+                name="enc_output"),
+            Input(
+                [None, self.inputs[1].shape[1], None, None],
+                "float32",
+                name="trg_src_attn_bias"),
+        ]
+        return inputs
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestSequenceTagging(ModuleApiTest):
+    def setUp(self):
+        shape = (2, 4, 128)
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None
+        self.attrs = {"input_size": 128, "hidden_size": 128}
+        self.param_states = {}
+
+    @staticmethod
+    def model_init(self, input_size, hidden_size):
+        self.module = SequenceTagging(input_size, hidden_size)
+
+    @staticmethod
+    def model_forward(self, inputs):
+        return self.module(inputs)[0]
+
+    def make_inputs(self):
+        inputs = [
+            Input(
+                [None, None, self.inputs[-1].shape[-1]],
                 "float32",
-                name="init_hidden"), Input(
-                    [None, self.inputs[1].shape[-1]],
-                    "float32",
-                    name="init_cell")
+                name="input"),
         ]
         return inputs
 
diff --git a/hapi/text/__init__.py b/hapi/text/__init__.py
index 2177ada..890e989 100644
--- a/hapi/text/__init__.py
+++ b/hapi/text/__init__.py
@@ -28,6 +28,6 @@ from hapi.text.text import TransformerBeamSearchDecoder as TransformerBeamSearch
 from hapi.text.text import GRUCell as GRUCell
 from hapi.text.text import GRUEncoderCell as GRUEncoderCell
 from hapi.text.text import BiGRU as BiGRU
-from hapi.text.text import Linear_chain_crf as Linear_chain_crf
-from hapi.text.text import Crf_decoding as Crf_decoding
+from hapi.text.text import LinearChainCRF as LinearChainCRF
+from hapi.text.text import CRFDecoding as CRFDecoding
 from hapi.text.text import SequenceTagging as SequenceTagging
diff --git a/hapi/text/text.py b/hapi/text/text.py
index 0a382cd..8332700 100644
--- a/hapi/text/text.py
+++ b/hapi/text/text.py
@@ -49,7 +49,7 @@ __all__ = [
     'BeamSearchDecoder', 'MultiHeadAttention', 'FFN', 'TransformerEncoderLayer',
     'TransformerEncoder', 'TransformerDecoderLayer', 'TransformerDecoder',
     'TransformerCell', 'TransformerBeamSearchDecoder',
-    'Linear_chain_crf', 'Crf_decoding', 'SequenceTagging', 'GRUEncoderLayer'
+    'LinearChainCRF', 'CRFDecoding', 'SequenceTagging', 'GRUEncoder'
 ]
 
 
@@ -1008,18 +1008,38 @@ class TransformerCell(Layer):
     used as RNNCell
     """
 
-    def __init__(self, decoder):
+    def __init__(self, decoder, embedding_fn=None, output_fn=None):
+        super(TransformerCell, self).__init__()
         self.decoder = decoder
+        self.embedding_fn = embedding_fn
+        self.output_fn = output_fn
 
-    def __call__(self, inputs, states, trg_src_attn_bias, enc_output,
-                 static_caches):
+    def forward(self, inputs, states, trg_src_attn_bias, enc_output,
+                static_caches):
         trg_word, trg_pos = inputs
         for cache, static_cache in zip(states, static_caches):
             cache.update(static_cache)
-        logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
-                              enc_output, states)
+        if self.embedding_fn is not None:
+            dec_input = self.embedding_fn(trg_word, trg_pos)
+            outputs = self.decoder(dec_input, enc_output, None,
+                                   trg_src_attn_bias, states)
+        else:
+            outputs = self.decoder(trg_word, trg_pos, enc_output, None,
+                                   trg_src_attn_bias, states)
+        if self.output_fn is not None:
+            outputs = self.output_fn(outputs)
+        if len(outputs.shape) == 3:
+            # squeeze to adapt to BeamSearchDecoder which uses 2D logits
+            outputs = layers.squeeze(outputs, [1])
         new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states]
-        return logits, new_states
+        return outputs, new_states
+
+    @property
+    def state_shape(self):
+        return [{
+            "k": [self.decoder.n_head, 0, self.decoder.d_key],
+            "v": [self.decoder.n_head, 0, self.decoder.d_value],
+        } for i in range(self.decoder.n_layer)]
 
 
 class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):
@@ -1521,6 +1541,11 @@ class TransformerDecoder(Layer):
                  preprocess_cmd, postprocess_cmd):
         super(TransformerDecoder, self).__init__()
 
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.d_key = d_key
+        self.d_value = d_value
+
         self.decoder_layers = list()
         for i in range(n_layer):
             self.decoder_layers.append(
@@ -1555,6 +1580,20 @@
             for decoder_layer in self.decoder_layers
         ]
 
+    def prepare_incremental_cache(self, enc_output):
+        return [{
+            "k": layers.fill_constant_batch_size_like(
+                input=enc_output,
+                shape=[-1, self.n_head, 0, self.d_key],
+                dtype=enc_output.dtype,
+                value=0),
+            "v": layers.fill_constant_batch_size_like(
+                input=enc_output,
+                shape=[-1, self.n_head, 0, self.d_value],
+                dtype=enc_output.dtype,
+                value=0),
+        } for i in range(self.n_layer)]
+
 
 #TODO: we should merge GRUCell with BasicGRUCell
 class GRUCell(RNNCell):
@@ -1651,9 +1690,9 @@ class BiGRU(fluid.dygraph.Layer):
         return bi_merge
 
 
-class Linear_chain_crf(fluid.dygraph.Layer):
+class LinearChainCRF(Layer):
     def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
-        super(Linear_chain_crf, self).__init__()
+        super(LinearChainCRF, self).__init__()
 
         self._param_attr = param_attr
         self._dtype = dtype
@@ -1702,9 +1741,9 @@ class Linear_chain_crf(fluid.dygraph.Layer):
         return log_likelihood
 
 
-class Crf_decoding(fluid.dygraph.Layer):
+class CRFDecoding(Layer):
     def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
-        super(Crf_decoding, self).__init__()
+        super(CRFDecoding, self).__init__()
 
         self._dtype = dtype
         self._size = size
@@ -1742,7 +1781,7 @@ class Crf_decoding(fluid.dygraph.Layer):
         return viterbi_path
 
 
-class GRUEncoderLayer(Layer):
+class GRUEncoder(Layer):
     def __init__(self,
                  input_dim,
                  grnn_hidden_dim,
@@ -1750,7 +1789,7 @@
                  num_layers=1,
                  h_0=None,
                  is_bidirection=False):
-        super(GRUEncoderLayer, self).__init__()
+        super(GRUEncoder, self).__init__()
         self.h_0 = h_0
         self.num_layers = num_layers
         self.is_bidirection = is_bidirection
@@ -1849,7 +1888,7 @@ class SequenceTagging(fluid.dygraph.Layer):
             force_cpu=True,
             name='h_0')
 
-        self.gru_encoder = GRUEncoderLayer(
+        self.gru_encoder = GRUEncoder(
             input_dim=self.grnn_hidden_dim,
             grnn_hidden_dim=self.grnn_hidden_dim,
             init_bound=self.init_bound,
@@ -1866,12 +1905,12 @@
                 regularizer=fluid.regularizer.L2DecayRegularizer(
                     regularization_coeff=1e-4)))
 
-        self.linear_chain_crf = Linear_chain_crf(
+        self.linear_chain_crf = LinearChainCRF(
             param_attr=fluid.ParamAttr(
                 name='linear_chain_crfw', learning_rate=self.crf_lr),
             size=self.num_labels)
 
-        self.crf_decoding = Crf_decoding(
+        self.crf_decoding = CRFDecoding(
             param_attr=fluid.ParamAttr(
                 name='crfw', learning_rate=self.crf_lr),
             size=self.num_labels)
-- 
GitLab