Add hapi.text and corresponding unit test. (#24457)

* Add hapi.text and corresponding unit test. test=develop * Remove hapi.text apis' reuse parameter args for coverage. test=develop * Fix TransformerCell and TransformerBeamSearchDecoder example codes. test=develop * Fix example codes in hapi.text. test=develop * Add some apis in hapi.text into example code white list. test=develop * Fix example code of DynamicDecode in hapi.text. text=develop * Rename Model.self as model in test_text.py test=develop

Add hapi.text and corresponding unit test. (#24457)
* Add hapi.text and corresponding unit test. test=develop * Remove hapi.text apis' reuse parameter args for coverage. test=develop * Fix TransformerCell and TransformerBeamSearchDecoder example codes. test=develop * Fix example codes in hapi.text. test=develop * Add some apis in hapi.text into example code white list. test=develop * Fix example code of DynamicDecode in hapi.text. text=develop * Rename Model.self as model in test_text.py test=develop
aa02e347 · Guo Sheng · GitHub · 526a2117 · aa02e347 · aa02e347
6 changed file
--- a/python/paddle/incubate/hapi/__init__.py
+++ b/python/paddle/incubate/hapi/__init__.py
@@ -22,6 +22,7 @@ from . import loss
 from . import datasets
 from . import distributed
 from . import vision
+from . import text
 logger.setup_logger()
@@ -33,6 +34,7 @@ __all__ = [
    'metrics',
    'loss',
    'vision',
+    'text',
 ]
 __all__ += model.__all__
--- a/python/paddle/incubate/hapi/tests/test_text.py
+++ b/python/paddle/incubate/hapi/tests/test_text.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import division
+from __future__ import print_function
+import unittest
+import random
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Embedding, Linear, Layer
+from paddle.fluid.layers import BeamSearchDecoder
+from paddle.incubate.hapi.model import Model, Input, set_device
+from paddle.incubate.hapi.text import *
+class ModuleApiTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls._np_rand_state = np.random.get_state()
+        cls._py_rand_state = random.getstate()
+        cls._random_seed = 123
+        np.random.seed(cls._random_seed)
+        random.seed(cls._random_seed)
+        cls.model_cls = type(cls.__name__ + "Model", (Model, ), {
+            "__init__": cls.model_init_wrapper(cls.model_init),
+            "forward": cls.model_forward
+        })
+    @classmethod
+    def tearDownClass(cls):
+        np.random.set_state(cls._np_rand_state)
+        random.setstate(cls._py_rand_state)
+    @staticmethod
+    def model_init_wrapper(func):
+        def __impl__(self, *args, **kwargs):
+            Model.__init__(self)
+            func(self, *args, **kwargs)
+        return __impl__
+    @staticmethod
+    def model_init(model, *args, **kwargs):
+        raise NotImplementedError(
+            "model_init acts as `Model.__init__`, thus must implement it")
+    @staticmethod
+    def model_forward(model, *args, **kwargs):
+        return model.module(*args, **kwargs)
+    def make_inputs(self):
+        # TODO(guosheng): add default from `self.inputs`
+        raise NotImplementedError(
+            "model_inputs makes inputs for model, thus must implement it")
+    def setUp(self):
+        """
+        For the model which wraps the module to be tested:
+            Set input data by `self.inputs` list
+            Set init argument values by `self.attrs` list/dict
+            Set model parameter values by `self.param_states` dict
+            Set expected output data by `self.outputs` list
+        We can create a model instance and run once with these.
+        """
+        self.inputs = []
+        self.attrs = {}
+        self.param_states = {}
+        self.outputs = []
+    def _calc_output(self, place, mode="test", dygraph=True):
+        if dygraph:
+            fluid.enable_dygraph(place)
+        else:
+            fluid.disable_dygraph()
+        fluid.default_main_program().random_seed = self._random_seed
+        fluid.default_startup_program().random_seed = self._random_seed
+        model = self.model_cls(**self.attrs) if isinstance(
+            self.attrs, dict) else self.model_cls(*self.attrs)
+        model.prepare(inputs=self.make_inputs(), device=place)
+        if self.param_states:
+            model.load(self.param_states, optim_state=None)
+        return model.test_batch(self.inputs)
+    def check_output_with_place(self, place, mode="test"):
+        dygraph_output = self._calc_output(place, mode, dygraph=True)
+        stgraph_output = self._calc_output(place, mode, dygraph=False)
+        expect_output = getattr(self, "outputs", None)
+        for actual_t, expect_t in zip(dygraph_output, stgraph_output):
+            self.assertTrue(np.allclose(actual_t, expect_t, rtol=1e-5, atol=0))
+        if expect_output:
+            for actual_t, expect_t in zip(dygraph_output, expect_output):
+                self.assertTrue(
+                    np.allclose(
+                        actual_t, expect_t, rtol=1e-5, atol=0))
+    def check_output(self):
+        devices = ["CPU", "GPU"] if fluid.is_compiled_with_cuda() else ["CPU"]
+        for device in devices:
+            place = set_device(device)
+            self.check_output_with_place(place)
+class TestBasicLSTM(ModuleApiTest):
+    def setUp(self):
+        # TODO(guosheng): Change to big size. Currently bigger hidden size for
+        # LSTM would fail, the second static graph run might get diff output
+        # with others.
+        shape = (2, 4, 16)
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None
+        self.attrs = {"input_size": 16, "hidden_size": 16}
+        self.param_states = {}
+    @staticmethod
+    def model_init(model, input_size, hidden_size):
+        model.lstm = RNN(
+            BasicLSTMCell(
+                input_size,
+                hidden_size,
+                param_attr=fluid.ParamAttr(name="lstm_weight"),
+                bias_attr=fluid.ParamAttr(name="lstm_bias")))
+    @staticmethod
+    def model_forward(model, inputs):
+        return model.lstm(inputs)[0]
+    def make_inputs(self):
+        inputs = [
+            Input(
+                [None, None, self.inputs[-1].shape[-1]],
+                "float32",
+                name="input"),
+        ]
+        return inputs
+    def test_check_output(self):
+        self.check_output()
+class TestBasicGRU(ModuleApiTest):
+    def setUp(self):
+        shape = (2, 4, 128)
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None
+        self.attrs = {"input_size": 128, "hidden_size": 128}
+        self.param_states = {}
+    @staticmethod
+    def model_init(model, input_size, hidden_size):
+        model.gru = RNN(BasicGRUCell(input_size, hidden_size))
+    @staticmethod
+    def model_forward(model, inputs):
+        return model.gru(inputs)[0]
+    def make_inputs(self):
+        inputs = [
+            Input(
+                [None, None, self.inputs[-1].shape[-1]],
+                "float32",
+                name="input"),
+        ]
+        return inputs
+    def test_check_output(self):
+        self.check_output()
+class TestBeamSearch(ModuleApiTest):
+    def setUp(self):
+        shape = (8, 32)
+        self.inputs = [
+            np.random.random(shape).astype("float32"),
+            np.random.random(shape).astype("float32")
+        ]
+        self.outputs = None
+        self.attrs = {
+            "vocab_size": 100,
+            "embed_dim": 32,
+            "hidden_size": 32,
+        }
+        self.param_states = {}
+    @staticmethod
+    def model_init(self,
+                   vocab_size,
+                   embed_dim,
+                   hidden_size,
+                   bos_id=0,
+                   eos_id=1,
+                   beam_size=4,
+                   max_step_num=20):
+        embedder = Embedding(size=[vocab_size, embed_dim])
+        output_layer = Linear(hidden_size, vocab_size)
+        cell = BasicLSTMCell(embed_dim, hidden_size)
+        decoder = BeamSearchDecoder(
+            cell,
+            start_token=bos_id,
+            end_token=eos_id,
+            beam_size=beam_size,
+            embedding_fn=embedder,
+            output_fn=output_layer)
+        self.beam_search_decoder = DynamicDecode(
+            decoder, max_step_num=max_step_num, is_test=True)
+    @staticmethod
+    def model_forward(model, init_hidden, init_cell):
+        return model.beam_search_decoder([init_hidden, init_cell])[0]
+    def make_inputs(self):
+        inputs = [
+            Input(
+                [None, self.inputs[0].shape[-1]], "float32",
+                name="init_hidden"),
+            Input(
+                [None, self.inputs[1].shape[-1]], "float32", name="init_cell"),
+        ]
+        return inputs
+    def test_check_output(self):
+        self.check_output()
+class TestTransformerEncoder(ModuleApiTest):
+    def setUp(self):
+        self.inputs = [
+            # encoder input: [batch_size, seq_len, hidden_size]
+            np.random.random([2, 4, 512]).astype("float32"),
+            # self attention bias: [batch_size, n_head, seq_len, seq_len]
+            np.random.randint(0, 1, [2, 8, 4, 4]).astype("float32") * -1e9
+        ]
+        self.outputs = None
+        self.attrs = {
+            "n_layer": 2,
+            "n_head": 8,
+            "d_key": 64,
+            "d_value": 64,
+            "d_model": 512,
+            "d_inner_hid": 1024
+        }
+        self.param_states = {}
+    @staticmethod
+    def model_init(model,
+                   n_layer,
+                   n_head,
+                   d_key,
+                   d_value,
+                   d_model,
+                   d_inner_hid,
+                   prepostprocess_dropout=0.1,
+                   attention_dropout=0.1,
+                   relu_dropout=0.1,
+                   preprocess_cmd="n",
+                   postprocess_cmd="da",
+                   ffn_fc1_act="relu"):
+        model.encoder = TransformerEncoder(
+            n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
+            prepostprocess_dropout, attention_dropout, relu_dropout,
+            preprocess_cmd, postprocess_cmd, ffn_fc1_act)
+    @staticmethod
+    def model_forward(model, enc_input, attn_bias):
+        return model.encoder(enc_input, attn_bias)
+    def make_inputs(self):
+        inputs = [
+            Input(
+                [None, None, self.inputs[0].shape[-1]],
+                "float32",
+                name="enc_input"),
+            Input(
+                [None, self.inputs[1].shape[1], None, None],
+                "float32",
+                name="attn_bias"),
+        ]
+        return inputs
+    def test_check_output(self):
+        self.check_output()
+class TestTransformerDecoder(TestTransformerEncoder):
+    def setUp(self):
+        self.inputs = [
+            # decoder input: [batch_size, seq_len, hidden_size]
+            np.random.random([2, 4, 512]).astype("float32"),
+            # encoder output: [batch_size, seq_len, hidden_size]
+            np.random.random([2, 5, 512]).astype("float32"),
+            # self attention bias: [batch_size, n_head, seq_len, seq_len]
+            np.random.randint(0, 1, [2, 8, 4, 4]).astype("float32") * -1e9,
+            # cross attention bias: [batch_size, n_head, seq_len, seq_len]
+            np.random.randint(0, 1, [2, 8, 4, 5]).astype("float32") * -1e9
+        ]
+        self.outputs = None
+        self.attrs = {
+            "n_layer": 2,
+            "n_head": 8,
+            "d_key": 64,
+            "d_value": 64,
+            "d_model": 512,
+            "d_inner_hid": 1024
+        }
+        self.param_states = {}
+    @staticmethod
+    def model_init(model,
+                   n_layer,
+                   n_head,
+                   d_key,
+                   d_value,
+                   d_model,
+                   d_inner_hid,
+                   prepostprocess_dropout=0.1,
+                   attention_dropout=0.1,
+                   relu_dropout=0.1,
+                   preprocess_cmd="n",
+                   postprocess_cmd="da"):
+        model.decoder = TransformerDecoder(
+            n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
+            prepostprocess_dropout, attention_dropout, relu_dropout,
+            preprocess_cmd, postprocess_cmd)
+    @staticmethod
+    def model_forward(model,
+                      dec_input,
+                      enc_output,
+                      self_attn_bias,
+                      cross_attn_bias,
+                      caches=None):
+        return model.decoder(dec_input, enc_output, self_attn_bias,
+                             cross_attn_bias, caches)
+    def make_inputs(self):
+        inputs = [
+            Input(
+                [None, None, self.inputs[0].shape[-1]],
+                "float32",
+                name="dec_input"),
+            Input(
+                [None, None, self.inputs[0].shape[-1]],
+                "float32",
+                name="enc_output"),
+            Input(
+                [None, self.inputs[-1].shape[1], None, None],
+                "float32",
+                name="self_attn_bias"),
+            Input(
+                [None, self.inputs[-1].shape[1], None, None],
+                "float32",
+                name="cross_attn_bias"),
+        ]
+        return inputs
+    def test_check_output(self):
+        self.check_output()
+class TestTransformerBeamSearchDecoder(ModuleApiTest):
+    def setUp(self):
+        self.inputs = [
+            # encoder output: [batch_size, seq_len, hidden_size]
+            np.random.random([2, 5, 128]).astype("float32"),
+            # cross attention bias: [batch_size, n_head, seq_len, seq_len]
+            np.random.randint(0, 1, [2, 2, 1, 5]).astype("float32") * -1e9
+        ]
+        self.outputs = None
+        self.attrs = {
+            "vocab_size": 100,
+            "n_layer": 2,
+            "n_head": 2,
+            "d_key": 64,
+            "d_value": 64,
+            "d_model": 128,
+            "d_inner_hid": 128
+        }
+        self.param_states = {}
+    @staticmethod
+    def model_init(model,
+                   vocab_size,
+                   n_layer,
+                   n_head,
+                   d_key,
+                   d_value,
+                   d_model,
+                   d_inner_hid,
+                   prepostprocess_dropout=0.1,
+                   attention_dropout=0.1,
+                   relu_dropout=0.1,
+                   preprocess_cmd="n",
+                   postprocess_cmd="da",
+                   bos_id=0,
+                   eos_id=1,
+                   beam_size=4,
+                   max_step_num=20):
+        model.beam_size = beam_size
+        def embeder_init(self, size):
+            Layer.__init__(self)
+            self.embedder = Embedding(size)
+        Embedder = type("Embedder", (Layer, ), {
+            "__init__": embeder_init,
+            "forward": lambda self, word, pos: self.embedder(word)
+        })
+        embedder = Embedder(size=[vocab_size, d_model])
+        output_layer = Linear(d_model, vocab_size)
+        model.decoder = TransformerDecoder(
+            n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
+            prepostprocess_dropout, attention_dropout, relu_dropout,
+            preprocess_cmd, postprocess_cmd)
+        transformer_cell = TransformerCell(model.decoder, embedder,
+                                           output_layer)
+        model.beam_search_decoder = DynamicDecode(
+            TransformerBeamSearchDecoder(
+                transformer_cell, bos_id, eos_id, beam_size,
+                var_dim_in_state=2),
+            max_step_num,
+            is_test=True)
+    @staticmethod
+    def model_forward(model, enc_output, trg_src_attn_bias):
+        caches = model.decoder.prepare_incremental_cache(enc_output)
+        enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
+            enc_output, model.beam_size)
+        trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
+            trg_src_attn_bias, model.beam_size)
+        static_caches = model.decoder.prepare_static_cache(enc_output)
+        rs, _ = model.beam_search_decoder(
+            inits=caches,
+            enc_output=enc_output,
+            trg_src_attn_bias=trg_src_attn_bias,
+            static_caches=static_caches)
+        return rs
+    def make_inputs(self):
+        inputs = [
+            Input(
+                [None, None, self.inputs[0].shape[-1]],
+                "float32",
+                name="enc_output"),
+            Input(
+                [None, self.inputs[1].shape[1], None, None],
+                "float32",
+                name="trg_src_attn_bias"),
+        ]
+        return inputs
+    def test_check_output(self):
+        self.check_output()
+class TestSequenceTagging(ModuleApiTest):
+    def setUp(self):
+        self.inputs = [
+            np.random.randint(0, 100, (2, 8)).astype("int64"),
+            np.random.randint(1, 8, (2)).astype("int64"),
+            np.random.randint(0, 5, (2, 8)).astype("int64")
+        ]
+        self.outputs = None
+        self.attrs = {"vocab_size": 100, "num_labels": 5}
+        self.param_states = {}
+    @staticmethod
+    def model_init(model,
+                   vocab_size,
+                   num_labels,
+                   word_emb_dim=128,
+                   grnn_hidden_dim=128,
+                   emb_learning_rate=0.1,
+                   crf_learning_rate=0.1,
+                   bigru_num=2,
+                   init_bound=0.1):
+        model.tagger = SequenceTagging(vocab_size, num_labels, word_emb_dim,
+                                       grnn_hidden_dim, emb_learning_rate,
+                                       crf_learning_rate, bigru_num, init_bound)
+    @staticmethod
+    def model_forward(model, word, lengths, target=None):
+        return model.tagger(word, lengths, target)
+    def make_inputs(self):
+        inputs = [
+            Input(
+                [None, None], "int64", name="word"),
+            Input(
+                [None], "int64", name="lengths"),
+            Input(
+                [None, None], "int64", name="target"),
+        ]
+        return inputs
+    def test_check_output(self):
+        self.check_output()
+class TestSequenceTaggingInfer(TestSequenceTagging):
+    def setUp(self):
+        super(TestSequenceTaggingInfer, self).setUp()
+        self.inputs = self.inputs[:2]  # remove target
+    def make_inputs(self):
+        inputs = super(TestSequenceTaggingInfer,
+                       self).make_inputs()[:2]  # remove target
+        return inputs
+class TestStackedRNN(ModuleApiTest):
+    def setUp(self):
+        shape = (2, 4, 16)
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None
+        self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2}
+        self.param_states = {}
+    @staticmethod
+    def model_init(model, input_size, hidden_size, num_layers):
+        cells = [
+            BasicLSTMCell(input_size, hidden_size),
+            BasicLSTMCell(hidden_size, hidden_size)
+        ]
+        stacked_cell = StackedRNNCell(cells)
+        model.lstm = RNN(stacked_cell)
+    @staticmethod
+    def model_forward(self, inputs):
+        return self.lstm(inputs)[0]
+    def make_inputs(self):
+        inputs = [
+            Input(
+                [None, None, self.inputs[-1].shape[-1]],
+                "float32",
+                name="input"),
+        ]
+        return inputs
+    def test_check_output(self):
+        self.check_output()
+class TestLSTM(ModuleApiTest):
+    def setUp(self):
+        shape = (2, 4, 16)
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None
+        self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2}
+        self.param_states = {}
+    @staticmethod
+    def model_init(model, input_size, hidden_size, num_layers):
+        model.lstm = LSTM(input_size, hidden_size, num_layers=num_layers)
+    @staticmethod
+    def model_forward(model, inputs):
+        return model.lstm(inputs)[0]
+    def make_inputs(self):
+        inputs = [
+            Input(
+                [None, None, self.inputs[-1].shape[-1]],
+                "float32",
+                name="input"),
+        ]
+        return inputs
+    def test_check_output(self):
+        self.check_output()
+class TestBiLSTM(ModuleApiTest):
+    def setUp(self):
+        shape = (2, 4, 16)
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None
+        self.attrs = {"input_size": 16, "hidden_size": 16, "num_layers": 2}
+        self.param_states = {}
+    @staticmethod
+    def model_init(model,
+                   input_size,
+                   hidden_size,
+                   num_layers,
+                   merge_mode="concat",
+                   merge_each_layer=False):
+        model.bilstm = BidirectionalLSTM(
+            input_size,
+            hidden_size,
+            num_layers=num_layers,
+            merge_mode=merge_mode,
+            merge_each_layer=merge_each_layer)
+    @staticmethod
+    def model_forward(model, inputs):
+        return model.bilstm(inputs)[0]
+    def make_inputs(self):
+        inputs = [
+            Input(
+                [None, None, self.inputs[-1].shape[-1]],
+                "float32",
+                name="input"),
+        ]
+        return inputs
+    def test_check_output_merge0(self):
+        self.check_output()
+    def test_check_output_merge1(self):
+        self.attrs["merge_each_layer"] = True
+        self.check_output()
+class TestGRU(ModuleApiTest):
+    def setUp(self):
+        shape = (2, 4, 64)
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None
+        self.attrs = {"input_size": 64, "hidden_size": 128, "num_layers": 2}
+        self.param_states = {}
+    @staticmethod
+    def model_init(model, input_size, hidden_size, num_layers):
+        model.gru = GRU(input_size, hidden_size, num_layers=num_layers)
+    @staticmethod
+    def model_forward(model, inputs):
+        return model.gru(inputs)[0]
+    def make_inputs(self):
+        inputs = [
+            Input(
+                [None, None, self.inputs[-1].shape[-1]],
+                "float32",
+                name="input"),
+        ]
+        return inputs
+    def test_check_output(self):
+        self.check_output()
+class TestBiGRU(ModuleApiTest):
+    def setUp(self):
+        shape = (2, 4, 64)
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None
+        self.attrs = {"input_size": 64, "hidden_size": 128, "num_layers": 2}
+        self.param_states = {}
+    @staticmethod
+    def model_init(model,
+                   input_size,
+                   hidden_size,
+                   num_layers,
+                   merge_mode="concat",
+                   merge_each_layer=False):
+        model.bigru = BidirectionalGRU(
+            input_size,
+            hidden_size,
+            num_layers=num_layers,
+            merge_mode=merge_mode,
+            merge_each_layer=merge_each_layer)
+    @staticmethod
+    def model_forward(model, inputs):
+        return model.bigru(inputs)[0]
+    def make_inputs(self):
+        inputs = [
+            Input(
+                [None, None, self.inputs[-1].shape[-1]],
+                "float32",
+                name="input"),
+        ]
+        return inputs
+    def test_check_output_merge0(self):
+        self.check_output()
+    def test_check_output_merge1(self):
+        self.attrs["merge_each_layer"] = True
+        self.check_output()
+class TestCNNEncoder(ModuleApiTest):
+    def setUp(self):
+        shape = (2, 32, 8)  # [N, C, H]
+        self.inputs = [np.random.random(shape).astype("float32")]
+        self.outputs = None
+        self.attrs = {"num_channels": 32, "num_filters": 64, "num_layers": 2}
+        self.param_states = {}
+    @staticmethod
+    def model_init(model, num_channels, num_filters, num_layers):
+        model.cnn_encoder = CNNEncoder(
+            num_layers=2,
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=[2, 3],
+            pool_size=[7, 6])
+    @staticmethod
+    def model_forward(model, inputs):
+        return model.cnn_encoder(inputs)
+    def make_inputs(self):
+        inputs = [
+            Input(
+                [None, self.inputs[-1].shape[1], None], "float32",
+                name="input"),
+        ]
+        return inputs
+    def test_check_output(self):
+        self.check_output()
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/incubate/hapi/text/__init__.py
+++ b/python/paddle/incubate/hapi/text/__init__.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from . import text
+from .text import *
+__all__ = text.__all__
--- a/python/paddle/incubate/hapi/text/text.py
+++ b/python/paddle/incubate/hapi/text/text.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import copy
+import collections
+import six
+import sys
+from functools import partial, reduce
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers.utils as utils
+from paddle.fluid import layers
+from paddle.fluid.layers import BeamSearchDecoder
+from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
+from paddle.fluid.dygraph import Layer, Embedding, Linear, LayerNorm, GRUUnit, Conv2D, Pool2D
+from paddle.fluid.data_feeder import convert_dtype
+__all__ = [
+    'RNNCell',
+    'BasicLSTMCell',
+    'BasicGRUCell',
+    'RNN',
+    'BidirectionalRNN',
+    'StackedRNNCell',
+    'StackedLSTMCell',
+    'LSTM',
+    'BidirectionalLSTM',
+    'StackedGRUCell',
+    'GRU',
+    'BidirectionalGRU',
+    'DynamicDecode',
+    'BeamSearchDecoder',
+    'Conv1dPoolLayer',
+    'CNNEncoder',
+    'MultiHeadAttention',
+    'FFN',
+    'TransformerEncoderLayer',
+    'TransformerEncoder',
+    'TransformerDecoderLayer',
+    'TransformerDecoder',
+    'TransformerCell',
+    'TransformerBeamSearchDecoder',
+    'LinearChainCRF',
+    'CRFDecoding',
+    'SequenceTagging',
+]
+class RNNCell(Layer):
+    """
+    RNNCell is the base class for abstraction representing the calculations
+    mapping the input and state to the output and new state. It is suitable to
+    and mostly used in RNN.
+    """
+    def get_initial_states(self,
+                           batch_ref,
+                           shape=None,
+                           dtype=None,
+                           init_value=0,
+                           batch_dim_idx=0):
+        """
+        Generate initialized states according to provided shape, data type and
+        value.
+        Parameters:
+            batch_ref: A (possibly nested structure of) tensor variable[s].
+                The first dimension of the tensor will be used as batch size to
+                initialize states.
+            shape: A (possibly nested structure of) shape[s], where a shape is
+                represented as a list/tuple of integer). -1(for batch size) will
+                beautomatically inserted if shape is not started with it. If None,
+                property `state_shape` will be used. The default value is None.
+            dtype: A (possibly nested structure of) data type[s]. The structure
+                must be same as that of `shape`, except when all tensors' in states
+                has the same data type, a single data type can be used. If None and
+                property `cell.state_shape` is not available, float32 will be used
+                as the data type. The default value is None.
+            init_value: A float value used to initialize states.
+            batch_dim_idx: An integer indicating which dimension of the tensor in
+                inputs represents batch size.  The default value is 0.
+        Returns:
+            Variable: tensor variable[s] packed in the same structure provided \
+                by shape, representing the initialized states.
+        """
+        # TODO: use inputs and batch_size
+        batch_ref = flatten(batch_ref)[0]
+        def _is_shape_sequence(seq):
+            if sys.version_info < (3, ):
+                integer_types = (
+                    int,
+                    long, )
+            else:
+                integer_types = (int, )
+            """For shape, list/tuple of integer is the finest-grained objection"""
+            if (isinstance(seq, list) or isinstance(seq, tuple)):
+                if reduce(lambda flag, x: isinstance(x, integer_types) and flag,
+                          seq, True):
+                    return False
+            # TODO: Add check for the illegal
+            if isinstance(seq, dict):
+                return True
+            return (isinstance(seq, collections.Sequence) and
+                    not isinstance(seq, six.string_types))
+        class Shape(object):
+            def __init__(self, shape):
+                self.shape = shape if shape[0] == -1 else ([-1] + list(shape))
+        # nested structure of shapes
+        states_shapes = self.state_shape if shape is None else shape
+        is_sequence_ori = utils.is_sequence
+        utils.is_sequence = _is_shape_sequence
+        states_shapes = map_structure(lambda shape: Shape(shape), states_shapes)
+        utils.is_sequence = is_sequence_ori
+        # nested structure of dtypes
+        try:
+            states_dtypes = self.state_dtype if dtype is None else dtype
+        except NotImplementedError:  # use fp32 as default
+            states_dtypes = "float32"
+        if len(flatten(states_dtypes)) == 1:
+            dtype = flatten(states_dtypes)[0]
+            states_dtypes = map_structure(lambda shape: dtype, states_shapes)
+        init_states = map_structure(
+            lambda shape, dtype: fluid.layers.fill_constant_batch_size_like(
+                input=batch_ref,
+                shape=shape.shape,
+                dtype=dtype,
+                value=init_value,
+                input_dim_idx=batch_dim_idx), states_shapes, states_dtypes)
+        return init_states
+    @property
+    def state_shape(self):
+        """
+        Abstract method (property).
+        Used to initialize states.
+        A (possiblely nested structure of) shape[s], where a shape is represented
+        as a list/tuple of integers (-1 for batch size would be automatically
+        inserted into a shape if shape is not started with it).
+        Not necessary to be implemented if states are not initialized by
+        `get_initial_states` or the `shape` argument is provided when using
+        `get_initial_states`.
+        """
+        raise NotImplementedError(
+            "Please add implementaion for `state_shape` in the used cell.")
+    @property
+    def state_dtype(self):
+        """
+        Abstract method (property).
+        Used to initialize states.
+        A (possiblely nested structure of) data types[s]. The structure must be
+        same as that of `shape`, except when all tensors' in states has the same
+        data type, a signle data type can be used.
+        Not necessary to be implemented if states are not initialized
+        by `get_initial_states` or the `dtype` argument is provided when using
+        `get_initial_states`.
+        """
+        raise NotImplementedError(
+            "Please add implementaion for `state_dtype` in the used cell.")
+class BasicLSTMCell(RNNCell):
+    """
+    Long-Short Term Memory(LSTM) RNN cell.
+    The formula used is as follows:
+    .. math::
+        i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
+        f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
+        c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
+        o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
+        h_{t} & = o_{t} act_c (c_{t})
+    Please refer to `An Empirical Exploration of Recurrent Network Architectures
+    <http://proceedings.mlr.press/v37/jozefowicz15.pdf>`_ for more details.
+    Parameters:
+        input_size (int): The input size in the LSTM cell.
+        hidden_size (int): The hidden size in the LSTM cell.
+        param_attr(ParamAttr, optional): The parameter attribute for the learnable
+            weight matrix. Default: None.
+        bias_attr (ParamAttr, optional): The parameter attribute for the bias
+            of LSTM. Default: None.
+        gate_activation (function, optional): The activation function for gates
+            of LSTM, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            LSTM, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        forget_bias(float, optional): forget bias used when computing forget gate.
+            Default 1.0
+        dtype(string, optional): The data type used in this cell. Default float32.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import BasicLSTMCell, RNN
+            inputs = paddle.rand((2, 4, 32))
+            cell = BasicLSTMCell(input_size=32, hidden_size=64)
+            rnn = RNN(cell=cell)
+            outputs, _ = rnn(inputs)  # [2, 4, 64]
+    """
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 param_attr=None,
+                 bias_attr=None,
+                 gate_activation=None,
+                 activation=None,
+                 forget_bias=1.0,
+                 dtype='float32'):
+        super(BasicLSTMCell, self).__init__()
+        self._hidden_size = hidden_size
+        self._param_attr = param_attr
+        self._bias_attr = bias_attr
+        self._gate_activation = gate_activation or layers.sigmoid
+        self._activation = activation or layers.tanh
+        # TODO(guosheng): find better way to resolve constants in __init__
+        self._forget_bias = layers.create_global_var(
+            shape=[1], dtype=dtype, value=forget_bias, persistable=True)
+        self._forget_bias.stop_gradient = True
+        self._dtype = dtype
+        self._input_size = input_size
+        self._weight = self.create_parameter(
+            attr=self._param_attr,
+            shape=[
+                self._input_size + self._hidden_size, 4 * self._hidden_size
+            ],
+            dtype=self._dtype)
+        self._bias = self.create_parameter(
+            attr=self._bias_attr,
+            shape=[4 * self._hidden_size],
+            dtype=self._dtype,
+            is_bias=True)
+    def forward(self, inputs, states):
+        """
+        Performs single step LSTM calculations.
+        Parameters:
+            inputs (Variable): A tensor with shape `[batch_size, input_size]`,
+                corresponding to :math:`x_t` in the formula. The data type
+                should be float32 or float64.
+            states (Variable): A list of containing two tensors, each shaped
+                `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}`
+                in the formula. The data type should be float32 or float64.
+        Returns:
+            tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \
+                a tensor with shape `[batch_size, hidden_size]`, corresponding \
+                to :math:`h_{t}` in the formula; `new_states` is a list containing \
+                two tenser variables shaped `[batch_size, hidden_size]`, corresponding \
+                to :math:`h_{t}, c_{t}` in the formula. The data type of these \
+                tensors all is same as that of `states`.
+        """
+        pre_hidden, pre_cell = states
+        concat_input_hidden = layers.concat([inputs, pre_hidden], 1)
+        gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
+        gate_input = layers.elementwise_add(gate_input, self._bias)
+        i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
+        new_cell = layers.elementwise_add(
+            layers.elementwise_mul(
+                pre_cell,
+                self._gate_activation(
+                    layers.elementwise_add(f, self._forget_bias))),
+            layers.elementwise_mul(
+                self._gate_activation(i), self._activation(j)))
+        new_hidden = self._activation(new_cell) * self._gate_activation(o)
+        return new_hidden, [new_hidden, new_cell]
+    @property
+    def state_shape(self):
+        """
+        The `state_shape` of BasicLSTMCell is a list with two shapes: `[[hidden_size], [hidden_size]]`
+        (-1 for batch size would be automatically inserted into shape). These two
+        shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` separately.
+        """
+        return [[self._hidden_size], [self._hidden_size]]
+class BasicGRUCell(RNNCell):
+    """
+    Gated Recurrent Unit (GRU) RNN cell.
+    The formula for GRU used is as follows:
+    .. math::
+        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
+        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
+        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
+        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
+    Please refer to `An Empirical Exploration of Recurrent Network Architectures
+    <http://proceedings.mlr.press/v37/jozefowicz15.pdf>`_ for more details.
+    Parameters:
+        input_size (int): The input size for the first GRU cell.
+        hidden_size (int): The hidden size for every GRU cell.
+        param_attr(ParamAttr, optional): The parameter attribute for the learnable
+            weight matrix. Default: None.
+        bias_attr (ParamAttr, optional): The parameter attribute for the bias
+            of LSTM. Default: None.
+        gate_activation (function, optional): The activation function for gates
+            of GRU, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            GRU, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        dtype(string, optional): The data type used in this cell. Default float32.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import BasicGRUCell, RNN
+            inputs = paddle.rand((2, 4, 32))
+            cell = BasicGRUCell(input_size=32, hidden_size=64)
+            rnn = RNN(cell=cell)
+            outputs, _ = rnn(inputs)  # [2, 4, 64]
+    """
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 param_attr=None,
+                 bias_attr=None,
+                 gate_activation=None,
+                 activation=None,
+                 dtype='float32'):
+        super(BasicGRUCell, self).__init__()
+        self._input_size = input_size
+        self._hidden_size = hidden_size
+        self._param_attr = param_attr
+        self._bias_attr = bias_attr
+        self._gate_activation = gate_activation or layers.sigmoid
+        self._activation = activation or layers.tanh
+        self._dtype = dtype
+        if self._param_attr is not None and self._param_attr.name is not None:
+            gate_param_attr = copy.deepcopy(self._param_attr)
+            candidate_param_attr = copy.deepcopy(self._param_attr)
+            gate_param_attr.name += "_gate"
+            candidate_param_attr.name += "_candidate"
+        else:
+            gate_param_attr = self._param_attr
+            candidate_param_attr = self._param_attr
+        self._gate_weight = self.create_parameter(
+            attr=gate_param_attr,
+            shape=[
+                self._input_size + self._hidden_size, 2 * self._hidden_size
+            ],
+            dtype=self._dtype)
+        self._candidate_weight = self.create_parameter(
+            attr=candidate_param_attr,
+            shape=[self._input_size + self._hidden_size, self._hidden_size],
+            dtype=self._dtype)
+        if self._bias_attr is not None and self._bias_attr.name is not None:
+            gate_bias_attr = copy.deepcopy(self._bias_attr)
+            candidate_bias_attr = copy.deepcopy(self._bias_attr)
+            gate_bias_attr.name += "_gate"
+            candidate_bias_attr.name += "_candidate"
+        else:
+            gate_bias_attr = self._bias_attr
+            candidate_bias_attr = self._bias_attr
+        self._gate_bias = self.create_parameter(
+            attr=gate_bias_attr,
+            shape=[2 * self._hidden_size],
+            dtype=self._dtype,
+            is_bias=True)
+        self._candidate_bias = self.create_parameter(
+            attr=candidate_bias_attr,
+            shape=[self._hidden_size],
+            dtype=self._dtype,
+            is_bias=True)
+    def forward(self, inputs, states):
+        """
+        Performs single step GRU calculations.
+        Parameters:
+            inputs (Variable): A tensor with shape `[batch_size, input_size]`,
+                corresponding to :math:`x_t` in the formula. The data type
+                should be float32 or float64.
+            states (Variable): A tensor with shape `[batch_size, hidden_size]`.
+                corresponding to :math:`h_{t-1}` in the formula. The data type
+                should be float32 or float64.
+        Returns:
+            tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` and \
+                `new_states` is the same tensor shaped `[batch_size, hidden_size]`, \
+                corresponding to :math:`h_t` in the formula. The data type of the \
+                tensor is same as that of `states`.        
+        """
+        pre_hidden = states
+        concat_input_hidden = layers.concat([inputs, pre_hidden], axis=1)
+        gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)
+        gate_input = layers.elementwise_add(gate_input, self._gate_bias)
+        gate_input = self._gate_activation(gate_input)
+        r, u = layers.split(gate_input, num_or_sections=2, dim=1)
+        r_hidden = r * pre_hidden
+        candidate = layers.matmul(
+            layers.concat([inputs, r_hidden], 1), self._candidate_weight)
+        candidate = layers.elementwise_add(candidate, self._candidate_bias)
+        c = self._activation(candidate)
+        new_hidden = u * pre_hidden + (1 - u) * c
+        return new_hidden, new_hidden
+    @property
+    def state_shape(self):
+        """
+        The `state_shape` of BasicGRUCell is a shape `[hidden_size]` (-1 for batch
+        size would be automatically inserted into shape). The shape corresponds
+        to :math:`h_{t-1}`.
+        """
+        return [self._hidden_size]
+class RNN(Layer):
+    """
+    RNN creates a recurrent neural network specified by RNNCell `cell`, which
+    performs :code:`cell.forward()` repeatedly until reaches to the maximum
+    length of `inputs`.
+    Parameters:
+        cell(RNNCell): An instance of `RNNCell`.
+        is_reverse (bool, optional): Indicate whether to calculate in the reverse
+            order of input sequences. Default: `False`.
+        time_major (bool, optional): Indicate the data layout of Tensor included
+            in `input` and `output` tensors. If `False`, the data layout would
+            be batch major with shape `[batch_size, sequence_length, ...]`.  If
+            `True`, the data layout would be time major with shape
+            `[sequence_length, batch_size, ...]`. Default: `False`.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import StackedLSTMCell, RNN
+            inputs = paddle.rand((2, 4, 32))
+            cell = StackedLSTMCell(input_size=32, hidden_size=64)
+            rnn = RNN(cell=cell)
+            outputs, _ = rnn(inputs)  # [2, 4, 64]
+    """
+    def __init__(self, cell, is_reverse=False, time_major=False):
+        super(RNN, self).__init__()
+        self.cell = cell
+        if not hasattr(self.cell, "call"):
+            self.cell.call = self.cell.forward
+        self.is_reverse = is_reverse
+        self.time_major = time_major
+        self.batch_index, self.time_step_index = (1, 0) if time_major else (0,
+                                                                            1)
+    def forward(self,
+                inputs,
+                initial_states=None,
+                sequence_length=None,
+                **kwargs):
+        """
+        Performs :code:`cell.forward()` repeatedly until reaches to the maximum
+        length of `inputs`.
+        Parameters:
+            inputs (Variable): A (possibly nested structure of) tensor variable[s]. 
+                The shape of tensor should be `[batch_size, sequence_length, ...]`
+                for `time_major == False` or `[sequence_length, batch_size, ...]`
+                for `time_major == True`. It represents the inputs to be unrolled
+                in RNN.
+            initial_states (Variable, optional): A (possibly nested structure of)
+                tensor variable[s], representing the initial state for RNN. 
+                If not provided, `cell.get_initial_states` would be used to produce
+                the initial state. Default None.
+            sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
+                It stores real length of each instance, thus enables users to extract
+                the last valid state when past a batch element's sequence length for
+                correctness. If not provided, the paddings would be treated same as
+                non-padding inputs. Default None.
+            **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. 
+        Returns:
+            tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \
+                outputs and states, both are Tensor or nested structure of Tensor. \
+                `final_outputs` has the same structure and data types as \
+                the returned `outputs` of :code:`cell.forward` , and each Tenser in `final_outputs` \
+                stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \
+                for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \
+                `final_states` is the counterpart at last time step of initial states, \
+                thus has the same structure with it and has tensors with same shapes \
+                and data types.
+        """
+        if fluid.in_dygraph_mode():
+            class ArrayWrapper(object):
+                def __init__(self, x):
+                    self.array = [x]
+                def append(self, x):
+                    self.array.append(x)
+                    return self
+            def _maybe_copy(state, new_state, step_mask):
+                # TODO: use where_op
+                new_state = fluid.layers.elementwise_mul(
+                    new_state, step_mask,
+                    axis=0) - fluid.layers.elementwise_mul(
+                        state, (step_mask - 1), axis=0)
+                return new_state
+            flat_inputs = flatten(inputs)
+            batch_size, time_steps = (
+                flat_inputs[0].shape[self.batch_index],
+                flat_inputs[0].shape[self.time_step_index])
+            if initial_states is None:
+                initial_states = self.cell.get_initial_states(
+                    batch_ref=inputs, batch_dim_idx=self.batch_index)
+            if not self.time_major:
+                inputs = map_structure(
+                    lambda x: fluid.layers.transpose(x, [1, 0] + list(
+                        range(2, len(x.shape)))), inputs)
+            if sequence_length is not None:
+                mask = fluid.layers.sequence_mask(
+                    sequence_length,
+                    maxlen=time_steps,
+                    dtype=flatten(initial_states)[0].dtype)
+                mask = fluid.layers.transpose(mask, [1, 0])
+            if self.is_reverse:
+                inputs = map_structure(
+                    lambda x: fluid.layers.reverse(x, axis=[0]), inputs)
+                mask = fluid.layers.reverse(
+                    mask, axis=[0]) if sequence_length is not None else None
+            states = initial_states
+            outputs = []
+            for i in range(time_steps):
+                step_inputs = map_structure(lambda x: x[i], inputs)
+                step_outputs, new_states = self.cell(step_inputs, states,
+                                                     **kwargs)
+                if sequence_length is not None:
+                    new_states = map_structure(
+                        partial(
+                            _maybe_copy, step_mask=mask[i]),
+                        states,
+                        new_states)
+                states = new_states
+                outputs = map_structure(
+                    lambda x: ArrayWrapper(x),
+                    step_outputs) if i == 0 else map_structure(
+                        lambda x, x_array: x_array.append(x), step_outputs,
+                        outputs)
+            final_outputs = map_structure(
+                lambda x: fluid.layers.stack(x.array, axis=self.time_step_index
+                                             ), outputs)
+            if self.is_reverse:
+                final_outputs = map_structure(
+                    lambda x: fluid.layers.reverse(x, axis=self.time_step_index
+                                                   ), final_outputs)
+            final_states = new_states
+        else:
+            final_outputs, final_states = fluid.layers.rnn(
+                self.cell,
+                inputs,
+                initial_states=initial_states,
+                sequence_length=sequence_length,
+                time_major=self.time_major,
+                is_reverse=self.is_reverse,
+                **kwargs)
+        return final_outputs, final_states
+class StackedRNNCell(RNNCell):
+    """
+    Wrapper allowing a stack of RNN cells to behave as a single cell. It is used
+    to implement stacked RNNs.
+    Parameters:
+        cells (list|tuple): List of RNN cell instances.
+    Examples:
+        .. code-block:: python
+            from paddle.incubate.hapi.text import BasicLSTMCell, StackedRNNCell
+            cells = [BasicLSTMCell(32, 32), BasicLSTMCell(32, 32)]
+            stack_rnn = StackedRNNCell(cells)
+    """
+    def __init__(self, cells):
+        super(StackedRNNCell, self).__init__()
+        self.cells = []
+        for i, cell in enumerate(cells):
+            self.cells.append(self.add_sublayer("cell_%d" % i, cell))
+    def forward(self, inputs, states, **kwargs):
+        """
+        Performs :code:`cell.forward` for all including cells sequentially.
+        Each cell's `inputs` is the `outputs` of the previous cell. And each
+        cell's `states` is the corresponding one in `states`.
+        Parameters:
+            inputs (Variable): The inputs for the first cell. Mostly it is a
+                float32 or float64 tensor with shape `[batch_size, input_size]`.
+            states (list): A list containing states for all cells orderly.
+            **kwargs: Additional keyword arguments, which passed to `cell.forward`
+                for all including cells.
+        Returns:
+            tuple: A tuple( :code:`(outputs, new_states)` ). `outputs` is the \
+                `outputs` of the last cell. `new_states` is a list composed \
+                of all cells' `new_states`, and its structure and data type is \
+                same as that of `states` argument.
+        """
+        new_states = []
+        for cell, state in zip(self.cells, states):
+            outputs, new_state = cell(inputs, state, **kwargs)
+            inputs = outputs
+            new_states.append(new_state)
+        return outputs, new_states
+    @staticmethod
+    def stack_param_attr(param_attr, n):
+        """
+        If `param_attr` is a list or tuple, convert every element in it to a
+        ParamAttr instance. Otherwise, repeat `param_attr` `n` times to
+        construct a list, and rename every one by appending a increasing index
+        suffix to avoid having same names when `param_attr` contains a name.
+        Parameters:
+            param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+                converted to a ParamAttr instance by `ParamAttr._to_attr`.
+            n (int): The times to repeat to construct a list when `param_attr`
+                is not a list or tuple.
+        Returns:
+            list: A list composed of each including cell's `param_attr`.
+        """
+        if isinstance(param_attr, (list, tuple)):
+            assert len(param_attr) == n, (
+                "length of param_attr should be %d when it is a list/tuple" % n)
+            param_attrs = [
+                fluid.ParamAttr._to_attr(attr) for attr in param_attr
+            ]
+        else:
+            param_attrs = []
+            attr = fluid.ParamAttr._to_attr(param_attr)
+            for i in range(n):
+                attr_i = copy.deepcopy(attr)
+                if attr.name:
+                    attr_i.name = attr_i.name + "_" + str(i)
+                param_attrs.append(attr_i)
+        return param_attrs
+    @property
+    def state_shape(self):
+        """
+        The `state_shape` of StackedRNNCell is a list composed of each including
+        cell's `state_shape`.
+        Returns:
+            list: A list composed of each including cell's `state_shape`.
+        """
+        return [cell.state_shape for cell in self.cells]
+class StackedLSTMCell(RNNCell):
+    """
+    Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used
+    to implement stacked LSTM.
+    The formula for LSTM used here is as follows:
+    .. math::
+        i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
+        f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
+        c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
+        o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
+        h_{t} & = o_{t} act_c (c_{t})
+    Parameters:
+        input_size (int): The input size for the first LSTM cell.
+        hidden_size (int): The hidden size for every LSTM cell.
+        gate_activation (function, optional): The activation function for gates
+            of LSTM, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            LSTM, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        forget_bias (float, optional): forget bias used when computing forget
+            gate. It also can accept a boolean value `True`, which would set
+            :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and
+            :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in
+            http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf .
+            Default 1.0.
+        num_layers(int, optional): The number of LSTM to be stacked. Default 1.
+        dropout(float|list|tuple, optional): The dropout probability after each
+            LSTM. It also can be a list or tuple, including dropout probabilities
+            for the corresponding LSTM. Default 0.0
+        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
+            Default None.
+        bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
+            Default None.
+        dtype(string, optional): The data type used in this cell. It can be
+            float32 or float64. Default float32.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import StackedLSTMCell, RNN
+            inputs = paddle.rand((2, 4, 32))
+            cell = StackedLSTMCell(input_size=32, hidden_size=64)
+            rnn = RNN(cell=cell)
+            outputs, _ = rnn(inputs)  # [2, 4, 64]
+    """
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 forget_bias=1.0,
+                 num_layers=1,
+                 dropout=0.0,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype="float32"):
+        super(StackedLSTMCell, self).__init__()
+        self.dropout = utils.convert_to_list(dropout, num_layers, "dropout",
+                                             float)
+        param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers)
+        bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers)
+        self.cells = []
+        for i in range(num_layers):
+            if forget_bias is True:
+                bias_attrs[
+                    i].initializer = fluid.initializer.NumpyArrayInitializer(
+                        np.concatenate(
+                            np.zeros(2 * hidden_size),
+                            np.ones(hidden_size), np.zeros(hidden_size)).astype(
+                                dtype))
+                forget_bias = 0.0
+            self.cells.append(
+                self.add_sublayer(
+                    "lstm_%d" % i,
+                    BasicLSTMCell(
+                        input_size=input_size if i == 0 else hidden_size,
+                        hidden_size=hidden_size,
+                        gate_activation=gate_activation,
+                        activation=activation,
+                        forget_bias=forget_bias,
+                        param_attr=param_attrs[i],
+                        bias_attr=bias_attrs[i],
+                        dtype=dtype)))
+    def forward(self, inputs, states):
+        """
+        Performs the stacked LSTM cells sequentially. Each cell's `inputs` is
+        the `outputs` of the previous cell. And each cell's `states` is the
+        corresponding one in `states`.
+        Parameters:
+            inputs (Variable): The inputs for the first cell. It is a float32 or
+                float64 tensor with shape `[batch_size, input_size]`.
+            states (list): A list containing states for all cells orderly.
+            **kwargs: Additional keyword arguments, which passed to `cell.forward`
+                for all including cells.
+        Returns:
+            tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \
+                a tensor with shape `[batch_size, hidden_size]`, corresponding \
+                to :math:`h_{t}` in the formula of the last LSTM; `new_states` \
+                is a list composed of every LSTM `new_states` which is a pair \
+                of tensors standing for :math:`h_{t}, c_{t}` in the formula, \
+                and the data type and structure of these tensors all is same \
+                as that of `states`.
+        """
+        new_states = []
+        for i, cell in enumerate(self.cells):
+            outputs, new_state = cell(inputs, states[i])
+            outputs = layers.dropout(
+                outputs,
+                self.dropout[i],
+                dropout_implementation='upscale_in_train') if self.dropout[
+                    i] > 0 else outputs
+            inputs = outputs
+            new_states.append(new_state)
+        return outputs, new_states
+    @property
+    def state_shape(self):
+        """
+        The `state_shape` of StackedLSTMCell is a list composed of each including
+        LSTM cell's `state_shape`.
+        Returns:
+            list: A list composed of each including LSTM cell's `state_shape`.
+        """
+        return [cell.state_shape for cell in self.cells]
+class LSTM(Layer):
+    """
+    Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input
+    sequence.
+    The formula for LSTM used here is as follows:
+    .. math::
+        i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
+        f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
+        c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
+        o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
+        h_{t} & = o_{t} act_c (c_{t})
+    Parameters:
+        input_size (int): The input feature size for the first LSTM.
+        hidden_size (int): The hidden size for every LSTM.
+        gate_activation (function, optional): The activation function for gates
+            of LSTM, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            LSTM, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        forget_bias (float, optional): forget bias used when computing forget
+            gate. It also can accept a boolean value `True`, which would set
+            :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and
+            :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in
+            http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf .
+            Default 1.0.
+        num_layers(int, optional): The number of LSTM to be stacked. Default 1.
+        dropout(float|list|tuple, optional): The dropout probability after each
+            LSTM. It also can be a list or tuple, including dropout probabilities
+            for the corresponding LSTM. Default 0.0
+        is_reverse (bool, optional): Indicate whether to calculate in the reverse
+            order of input sequences. Default: `False`.
+        time_major (bool, optional): Indicate the data layout of Tensor included
+            in `input` and `output` tensors. If `False`, the data layout would
+            be batch major with shape `[batch_size, sequence_length, ...]`.  If
+            `True`, the data layout would be time major with shape
+            `[sequence_length, batch_size, ...]`. Default: `False`.
+        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
+            Default None.
+        bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
+            Default None.
+        dtype(string, optional): The data type used in this cell. It can be
+            float32 or float64. Default float32.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import LSTM
+            inputs = paddle.rand((2, 4, 32))
+            lstm = LSTM(input_size=32, hidden_size=64, num_layers=2)
+            outputs, _ = lstm(inputs)  # [2, 4, 64]
+    """
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 forget_bias=1.0,
+                 num_layers=1,
+                 dropout=0.0,
+                 is_reverse=False,
+                 time_major=False,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype='float32'):
+        super(LSTM, self).__init__()
+        lstm_cell = StackedLSTMCell(input_size, hidden_size, gate_activation,
+                                    activation, forget_bias, num_layers,
+                                    dropout, param_attr, bias_attr, dtype)
+        self.lstm = RNN(lstm_cell, is_reverse, time_major)
+    def forward(self, inputs, initial_states=None, sequence_length=None):
+        """
+        Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs`
+        is the `inputs` of the subsequent one.
+        Parameters:
+            inputs (Variable): The inputs for the first LSTM. It is a float32
+                or float64 tensor shaped `[batch_size, sequence_length, input_size]`.
+            initial_states (list|None, optional): A list containing initial states 
+                of all stacked LSTM, and the initial states of each LSTM is a pair
+                of tensors shaped `[batch_size, hidden_size]`. If not provided,
+                use 0 as initial states. Default None.
+            sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
+                It stores real length of each instance, thus enables users to extract
+                the last valid state when past a batch element's sequence length for
+                correctness. If not provided, the paddings would be treated same as
+                non-padding inputs. Default None.
+        Returns:
+            tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
+                is the output of last LSTM and it is a tensor with shape \
+                `[batch_size, sequence_length, hidden_size]` and has the same \
+                data type as `inputs`, `final_states` is the counterpart of \
+                `initial_states` at last time step, thus has the same structure \
+                with it and has tensors with same shapes data types. 
+        """
+        return self.lstm(inputs, initial_states, sequence_length)
+class BidirectionalRNN(Layer):
+    """
+    Wrapper for bidirectional RNN. It assembles two RNNCell instances to perform
+    forward and backward RNN separately, and merge outputs of these two RNN
+    according to `merge_mode`.
+    Parameters:
+        cell_fw (RNNCell): A RNNCell instance used for forward RNN.
+        cell_bw (RNNCell): A RNNCell instance used for backward RNN.
+        merge_mode (str|None, optional): The way to merget outputs of forward and
+            backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None,
+            where None stands for make the two `outputs` as a tuple, `zip` stands
+            for make each two corresponding tensors of the two `outputs` as a tuple.
+            Default `concat`
+    Examples:
+        .. code-block:: python
+            import paddle
+            from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN
+            inputs = paddle.rand((2, 4, 32))
+            cell_fw = StackedLSTMCell(32, 64)
+            cell_bw = StackedLSTMCell(32, 64)
+            bi_rnn = BidirectionalRNN(cell_fw, cell_bw)
+            outputs, _ = bi_rnn(inputs)  # [2, 4, 128]
+    """
+    def __init__(self,
+                 cell_fw,
+                 cell_bw,
+                 merge_mode='concat',
+                 time_major=False,
+                 cell_cls=None,
+                 **kwargs):
+        super(BidirectionalRNN, self).__init__()
+        self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major)
+        self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major)
+        if merge_mode == 'concat':
+            self.merge_func = lambda x, y: layers.concat([x, y], -1)
+        elif merge_mode == 'sum':
+            self.merge_func = lambda x, y: layers.elementwise_add(x, y)
+        elif merge_mode == 'ave':
+            self.merge_func = lambda x, y: layers.scale(
+                layers.elementwise_add(x, y), 0.5)
+        elif merge_mode == 'mul':
+            self.merge_func = lambda x, y: layers.elementwise_mul(x, y)
+        elif merge_mode == 'zip':
+            self.merge_func = lambda x, y: (x, y)
+        elif merge_mode is None:
+            self.merge_func = None
+        else:
+            raise ValueError('Unsupported value for `merge_mode`: %s' %
+                             merge_mode)
+    def forward(self,
+                inputs,
+                initial_states=None,
+                sequence_length=None,
+                **kwargs):
+        """
+        Performs forward and backward RNN separately, and merge outputs of these
+        two RNN according to `merge_mode`.
+        Parameters:
+            inputs (Variable): A (possibly nested structure of) tensor variable[s]. 
+                The shape of tensor should be `[batch_size, sequence_length, ...]`
+                for `time_major == False` or `[sequence_length, batch_size, ...]`
+                for `time_major == True`. It represents the inputs to be unrolled
+                in both forward and backward RNN.
+            initial_states (Variable|list|tuple): If it is a list or tuple, its
+                length should be 2 to include initial states of forward and backward
+                RNN separately. Otherwise it would be used twice for the two RNN. 
+                If None, `cell.get_initial_states` would be used to produce the initial
+                states. Default None.
+            sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
+                It stores real length of each instance, thus enables users to extract
+                the last valid state when past a batch element's sequence length for
+                correctness. If not provided, the paddings would be treated same as
+                non-padding inputs. Default None.
+            **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`.
+        Returns:
+            tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
+                is produced by merge outputs of forward and backward RNN according \
+                to `merge_mode`, `final_states` is a pair including `final_states` \
+                of forward and backward RNN.
+        """
+        if isinstance(initial_states, (list, tuple)):
+            assert len(
+                initial_states
+            ) == 2, "length of initial_states should be 2 when it is a list/tuple"
+        else:
+            initial_states = [initial_states, initial_states]
+        outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0],
+                                            sequence_length, **kwargs)
+        outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1],
+                                            sequence_length, **kwargs)
+        outputs = map_structure(self.merge_func, outputs_fw,
+                                outputs_bw) if self.merge_func else (outputs_fw,
+                                                                     outputs_bw)
+        return outputs, (states_fw, states_bw)
+    @staticmethod
+    def bidirect_param_attr(param_attr):
+        """
+        Converts `param_attr` to a pair of `param_attr` when it is not a list
+        or tuple with length 2, also rename every one by appending a suffix to
+        avoid having same names when `param_attr` contains a name.
+        Parameters:
+            param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+                converted to a ParamAttr instance by `ParamAttr._to_attr`. When
+                it is a list or tuple, its length must be 2.
+        Returns:
+            list: A pair composed of forward and backward RNN cell's `param_attr`.
+        """
+        if isinstance(param_attr, (list, tuple)):
+            assert len(
+                param_attr
+            ) == 2, "length of param_attr should be 2 when it is a list/tuple"
+            param_attrs = param_attr
+        else:
+            param_attrs = []
+            attr = fluid.ParamAttr._to_attr(param_attr)
+            attr_fw = copy.deepcopy(attr)
+            if attr.name:
+                attr_fw.name = attr_fw.name + "_fw"
+            param_attrs.append(attr_fw)
+            attr_bw = copy.deepcopy(attr)
+            if attr.name:
+                attr_bw.name = attr_bw.name + "_bw"
+            param_attrs.append(attr_bw)
+        return param_attrs
+class BidirectionalLSTM(Layer):
+    """
+    Applies a bidirectional multi-layer long short-term memory (LSTM) RNN to an
+    input sequence. 
+    Bidirection interaction can happen after each layer or only after the last
+    layer according to the  `merge_each_layer` setting. The way to interact,
+    that is how to merge outputs of the two direction, is determined by `merge_mode`.
+    The formula for LSTM used here is as follows:
+    .. math::
+        i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i})
+        f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias)
+        c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c})
+        o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o})
+        h_{t} & = o_{t} act_c (c_{t})
+    Parameters:
+        input_size (int): The input feature size for the first LSTM.
+        hidden_size (int): The hidden size for every LSTM.
+        gate_activation (function, optional): The activation function for gates
+            of LSTM, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            LSTM, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        forget_bias (float, optional): forget bias used when computing forget
+            gate. It also can accept a boolean value `True`, which would set
+            :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and
+            :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in
+            http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf .
+            Default 1.0.
+        num_layers(int, optional): The number of LSTM to be stacked. Default 1.
+        dropout(float|list|tuple, optional): The dropout probability after each
+            LSTM. It also can be a list or tuple, including dropout probabilities
+            for the corresponding LSTM. Default 0.0
+        merge_mode (str|None, optional): The way to merget outputs of forward and
+            backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None,
+            where None stands for make the two `outputs` as a tuple, `zip` stands
+            for make each two corresponding tensors of the two `outputs` as a tuple.
+            Default `concat`
+        merge_each_layer (bool, optional): Indicate whether bidirection interaction
+            happens after each layer or only after the last layer. Default: `False`.
+        time_major (bool, optional): Indicate the data layout of Tensor included
+            in `input` and `output` tensors. If `False`, the data layout would
+            be batch major with shape `[batch_size, sequence_length, ...]`.  If
+            `True`, the data layout would be time major with shape
+            `[sequence_length, batch_size, ...]`. Default: `False`.
+        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
+            Default None.
+        bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
+            Default None.
+        dtype(string, optional): The data type used in this cell. It can be
+            float32 or float64. Default float32.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import BidirectionalLSTM
+            inputs = paddle.rand((2, 4, 32))
+            bi_lstm = BidirectionalLSTM(input_size=32, hidden_size=64, num_layers=2)
+            outputs, _ = bi_lstm(inputs)  # [2, 4, 128]
+    """
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 forget_bias=1.0,
+                 num_layers=1,
+                 dropout=0.0,
+                 merge_mode='concat',
+                 merge_each_layer=False,
+                 time_major=False,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype='float32'):
+        super(BidirectionalLSTM, self).__init__()
+        self.num_layers = num_layers
+        self.merge_mode = merge_mode
+        self.merge_each_layer = merge_each_layer
+        param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr)
+        bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr)
+        if not merge_each_layer:
+            cell_fw = StackedLSTMCell(input_size, hidden_size, gate_activation,
+                                      activation, forget_bias, num_layers,
+                                      dropout, param_attrs[0], bias_attrs[0],
+                                      dtype)
+            cell_bw = StackedLSTMCell(input_size, hidden_size, gate_activation,
+                                      activation, forget_bias, num_layers,
+                                      dropout, param_attrs[1], bias_attrs[1],
+                                      dtype)
+            self.lstm = BidirectionalRNN(
+                cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major)
+        else:
+            fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0],
+                                                             num_layers)
+            bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1],
+                                                             num_layers)
+            fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0],
+                                                            num_layers)
+            bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1],
+                                                            num_layers)
+            # maybe design cell including both forward and backward later
+            self.lstm = []
+            for i in range(num_layers):
+                cell_fw = StackedLSTMCell(
+                    input_size
+                    if i == 0 else (hidden_size * 2
+                                    if merge_mode == 'concat' else hidden_size),
+                    hidden_size, gate_activation, activation, forget_bias, 1,
+                    dropout, fw_param_attrs[i], fw_bias_attrs[i], dtype)
+                cell_bw = StackedLSTMCell(
+                    input_size
+                    if i == 0 else (hidden_size * 2
+                                    if merge_mode == 'concat' else hidden_size),
+                    hidden_size, gate_activation, activation, forget_bias, 1,
+                    dropout, bw_param_attrs[i], bw_bias_attrs[i], dtype)
+                self.lstm.append(
+                    self.add_sublayer(
+                        "lstm_%d" % i,
+                        BidirectionalRNN(
+                            cell_fw,
+                            cell_bw,
+                            merge_mode=merge_mode,
+                            time_major=time_major)))
+    def forward(self, inputs, initial_states=None, sequence_length=None):
+        """
+        Performs bidirectional multi-layer LSTM layer by layer. Each LSTM's `outputs`
+        is the `inputs` of the subsequent one, or when `merge_each_layer` is True,
+        merged outputs would be the `inputs` of the subsequent one.
+        Parameters:
+            inputs (Variable): The inputs for the first LSTM. It is a float32
+                or float64 tensor shaped `[batch_size, sequence_length, input_size]`.
+            initial_states (list|None, optional): A list containing initial states 
+                of all stacked LSTM. If `merge_each_layer` is True, the length of
+                list should be `num_layers` and a single value would be reused for
+                `num_layers`; Otherwise, the length should be 2 and a single value
+                would be reused twice. If not provided, use 0 as initial states.
+                Default None.
+            sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
+                It stores real length of each instance, thus enables users to extract
+                the last valid state when past a batch element's sequence length for
+                correctness. If not provided, the paddings would be treated same as
+                non-padding inputs. Default None.
+        Returns:
+            tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
+                is the output of last bidirectional LSTM; `final_states` is a \
+                pair including `final_states` of forward and backward LSTM when \
+                `merge_each_layer` is False or a list including `final_states` \
+                of all stacked bidirectional LSTM, and it has tensors with same \
+                shapes data types as `initial_states`.
+        """
+        if not self.merge_each_layer:
+            return self.lstm(inputs, initial_states, sequence_length)
+        else:
+            if isinstance(initial_states, (list, tuple)):
+                assert len(initial_states) == self.num_layers, (
+                    "length of initial_states should be %d when it is a list/tuple"
+                    % self.num_layers)
+            else:
+                initial_states = [initial_states] * self.num_layers
+            stacked_states = []
+            for i in range(self.num_layers):
+                outputs, states = self.lstm[i](inputs, initial_states[i],
+                                               sequence_length)
+                inputs = outputs
+                stacked_states.append(states)
+            return outputs, stacked_states
+class StackedGRUCell(RNNCell):
+    """
+    Wrapper allowing a stack of GRU cells to behave as a single cell. It is used
+    to implement stacked GRU.
+    The formula for GRU used here is as follows:
+    .. math::
+        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
+        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
+        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
+        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
+    Parameters:
+        input_size (int): The input size for the first GRU cell.
+        hidden_size (int): The hidden size for every GRU cell.
+        gate_activation (function, optional): The activation function for gates
+            of GRU, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            GRU, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        num_layers(int, optional): The number of LSTM to be stacked. Default 1.
+        dropout(float|list|tuple, optional): The dropout probability after each
+            GRU. It also can be a list or tuple, including dropout probabilities
+            for the corresponding GRU. Default 0.0
+        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
+            Default None.
+        bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
+            Default None.
+        dtype(string, optional): The data type used in this cell. It can be
+            float32 or float64. Default float32.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import StackedGRUCell, RNN
+            inputs = paddle.rand((2, 4, 32))
+            cell = StackedGRUCell(input_size=32, hidden_size=64)
+            rnn = RNN(cell=cell)
+            outputs, _ = rnn(inputs)  # [2, 4, 64]
+    """
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 num_layers=1,
+                 dropout=0.0,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype="float32"):
+        super(StackedGRUCell, self).__init__()
+        self.dropout = utils.convert_to_list(dropout, num_layers, "dropout",
+                                             float)
+        param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers)
+        bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers)
+        self.cells = []
+        for i in range(num_layers):
+            self.cells.append(
+                self.add_sublayer(
+                    "gru_%d" % i,
+                    BasicGRUCell(
+                        input_size=input_size if i == 0 else hidden_size,
+                        hidden_size=hidden_size,
+                        gate_activation=gate_activation,
+                        activation=activation,
+                        param_attr=param_attrs[i],
+                        bias_attr=bias_attrs[i],
+                        dtype=dtype)))
+    def forward(self, inputs, states):
+        """
+        Performs the stacked GRU cells sequentially. Each cell's `inputs` is
+        the `outputs` of the previous cell. And each cell's `states` is the
+        corresponding one in `states`.
+        Parameters:
+            inputs (Variable): The inputs for the first cell. It is a float32 or
+                float64 tensor with shape `[batch_size, input_size]`.
+            states (list): A list containing states for all cells orderly.
+            **kwargs: Additional keyword arguments, which passed to `cell.forward`
+                for all including cells.
+        Returns:
+            tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \
+                a tensor with shape `[batch_size, hidden_size]`, corresponding \
+                to :math:`h_{t}` in the formula of the last GRU; `new_states` \
+                is a list composed of every GRU `new_states` which is also \
+                :math:`h_{t}` in the formula, and the data type and structure \
+                of these tensors all is same as that of `states`.
+        """
+        new_states = []
+        for i, cell in enumerate(self.cells):
+            outputs, new_state = cell(inputs, states[i])
+            outputs = layers.dropout(
+                outputs,
+                self.dropout[i],
+                dropout_implementation='upscale_in_train') if self.dropout[
+                    i] > 0 else outputs
+            inputs = outputs
+            new_states.append(new_state)
+        return outputs, new_states
+    @property
+    def state_shape(self):
+        """
+        The `state_shape` of StackedGRUCell is a list composed of each including
+        GRU cell's `state_shape`.
+        Returns:
+            list: A list composed of each including GRU cell's `state_shape`.
+        """
+        return [cell.state_shape for cell in self.cells]
+class GRU(Layer):
+    """
+    Applies a stacked multi-layer gated recurrent unit (GRU) RNN to an input
+    sequence.
+    The formula for GRU used here is as follows:
+    .. math::
+        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
+        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
+        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
+        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
+    Parameters:
+        input_size (int): The input feature size for the first GRU cell.
+        hidden_size (int): The hidden size for every GRU cell.
+        gate_activation (function, optional): The activation function for gates
+            of GRU, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            GRU, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        num_layers(int, optional): The number of GRU to be stacked. Default 1.
+        dropout(float|list|tuple, optional): The dropout probability after each
+            GRU. It also can be a list or tuple, including dropout probabilities
+            for the corresponding GRU. Default 0.0
+        is_reverse (bool, optional): Indicate whether to calculate in the reverse
+            order of input sequences. Default: `False`.
+        time_major (bool, optional): Indicate the data layout of Tensor included
+            in `input` and `output` tensors. If `False`, the data layout would
+            be batch major with shape `[batch_size, sequence_length, ...]`.  If
+            `True`, the data layout would be time major with shape
+            `[sequence_length, batch_size, ...]`. Default: `False`.
+        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
+            Default None.
+        bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
+            Default None.
+        dtype(string, optional): The data type used in this cell. It can be
+            float32 or float64. Default float32.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import GRU
+            inputs = paddle.rand((2, 4, 32))
+            gru = GRU(input_size=32, hidden_size=64, num_layers=2)
+            outputs, _ = gru(inputs)  # [2, 4, 64]
+    """
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 num_layers=1,
+                 dropout=0.0,
+                 is_reverse=False,
+                 time_major=False,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype='float32'):
+        super(GRU, self).__init__()
+        gru_cell = StackedGRUCell(input_size, hidden_size, gate_activation,
+                                  activation, num_layers, dropout, param_attr,
+                                  bias_attr, dtype)
+        self.gru = RNN(gru_cell, is_reverse, time_major)
+    def forward(self, inputs, initial_states=None, sequence_length=None):
+        """
+        Performs the stacked multi-layer GRU layer by layer. Each GRU's `outputs`
+        is the `inputs` of the subsequent one.
+        Parameters:
+            inputs (Variable): The inputs for the first GRU. It is a float32
+                or float64 tensor shaped `[batch_size, sequence_length, input_size]`.
+            initial_states (list|None, optional): A list containing initial states 
+                of all stacked GRU, and the initial states of each GRU is a tensor
+                shaped `[batch_size, hidden_size]`. If not provided, use 0 as initial
+                states. Default None.
+            sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
+                It stores real length of each instance, thus enables users to extract
+                the last valid state when past a batch element's sequence length for
+                correctness. If not provided, the paddings would be treated same as
+                non-padding inputs. Default None.
+        Returns:
+            tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
+                is the output of last GRU and it is a tensor with shape \
+                `[batch_size, sequence_length, hidden_size]` and has the same \
+                data type as `inputs`, `final_states` is the counterpart of \
+                `initial_states` at last time step, thus has the same structure \
+                with it and has tensors with same shapes data types.
+        """
+        return self.gru(inputs, initial_states, sequence_length)
+class BidirectionalGRU(Layer):
+    """
+    Applies a bidirectional multi-layer gated recurrent unit (GRU) RNN to an input
+    sequence.
+    Bidirection interaction can happen after each layer or only after the last
+    layer according to the  `merge_each_layer` setting. The way to interact,
+    that is how to merge outputs of the two direction, is determined by `merge_mode`.
+    The formula for GRU used here is as follows:
+    .. math::
+        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
+        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
+        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
+        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
+    Parameters:
+        input_size (int): The input feature size  for the first GRU cell.
+        hidden_size (int): The hidden size for every GRU cell.
+        gate_activation (function, optional): The activation function for gates
+            of GRU, that is :math:`act_g` in the formula. Default: None,
+            representing for `fluid.layers.sigmoid`.
+        activation (function, optional): The non-gate activation function of
+            GRU, that is :math:`act_c` in the formula. Default: None,
+            representing for 'fluid.layers.tanh'.
+        num_layers(int, optional): The number of GRU to be stacked. Default 1.
+        dropout(float|list|tuple, optional): The dropout probability after each
+            GRU. It also can be a list or tuple, including dropout probabilities
+            for the corresponding GRU. Default 0.0
+        merge_mode (str|None, optional): The way to merget outputs of forward and
+            backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None,
+            where None stands for make the two `outputs` as a tuple, `zip` stands
+            for make each two corresponding tensors of the two `outputs` as a tuple.
+            Default `concat`
+        merge_each_layer (bool, optional): Indicate whether bidirection interaction
+            happens after each layer or only after the last layer. Default: `False`.
+        time_major (bool, optional): Indicate the data layout of Tensor included
+            in `input` and `output` tensors. If `False`, the data layout would
+            be batch major with shape `[batch_size, sequence_length, ...]`.  If
+            `True`, the data layout would be time major with shape
+            `[sequence_length, batch_size, ...]`. Default: `False`.
+        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
+            Default None.
+        bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
+            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
+            a list or tuple, it's length must equal to `num_layers`. Otherwise,
+            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
+            Default None.
+        dtype(string, optional): The data type used in this cell. It can be
+            float32 or float64. Default float32.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import BidirectionalGRU
+            inputs = paddle.rand((2, 4, 32))
+            bi_gru = BidirectionalGRU(input_size=32, hidden_size=64, num_layers=2)
+            outputs, _ = bi_gru(inputs)  # [2, 4, 128]
+    """
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 gate_activation=None,
+                 activation=None,
+                 forget_bias=1.0,
+                 num_layers=1,
+                 dropout=0.0,
+                 merge_mode='concat',
+                 merge_each_layer=False,
+                 time_major=False,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype='float32'):
+        super(BidirectionalGRU, self).__init__()
+        self.num_layers = num_layers
+        self.merge_mode = merge_mode
+        self.merge_each_layer = merge_each_layer
+        param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr)
+        bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr)
+        if not merge_each_layer:
+            cell_fw = StackedGRUCell(input_size, hidden_size, gate_activation,
+                                     activation, num_layers, dropout,
+                                     param_attrs[0], bias_attrs[0], dtype)
+            cell_bw = StackedGRUCell(input_size, hidden_size, gate_activation,
+                                     activation, num_layers, dropout,
+                                     param_attrs[1], bias_attrs[1], dtype)
+            self.gru = BidirectionalRNN(
+                cell_fw, cell_bw, merge_mode=merge_mode, time_major=time_major)
+        else:
+            fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0],
+                                                             num_layers)
+            bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1],
+                                                             num_layers)
+            fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0],
+                                                            num_layers)
+            bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1],
+                                                            num_layers)
+            # maybe design cell including both forward and backward later
+            self.gru = []
+            for i in range(num_layers):
+                cell_fw = StackedGRUCell(input_size if i == 0 else (
+                    hidden_size * 2 if merge_mode == 'concat' else
+                    hidden_size), hidden_size, gate_activation, activation, 1,
+                                         dropout, fw_param_attrs[i],
+                                         fw_bias_attrs[i], dtype)
+                cell_bw = StackedGRUCell(input_size if i == 0 else (
+                    hidden_size * 2 if merge_mode == 'concat' else
+                    hidden_size), hidden_size, gate_activation, activation, 1,
+                                         dropout, bw_param_attrs[i],
+                                         bw_bias_attrs[i], dtype)
+                self.gru.append(
+                    self.add_sublayer(
+                        "gru_%d" % i,
+                        BidirectionalRNN(
+                            cell_fw,
+                            cell_bw,
+                            merge_mode=merge_mode,
+                            time_major=time_major)))
+    def forward(self, inputs, initial_states=None, sequence_length=None):
+        """
+        Performs bidirectional multi-layer GRU layer by layer. Each GRU's `outputs`
+        is the `inputs` of the subsequent one, or when `merge_each_layer` is True,
+        merged outputs would be the `inputs` of the subsequent one.
+        Parameters:
+            inputs (Variable): The inputs for the first GRU. It is a float32
+                or float64 tensor shaped `[batch_size, sequence_length, input_size]`.
+            initial_states (list|None, optional): A list containing initial states 
+                of all stacked GRU. If `merge_each_layer` is True, the length of
+                list should be `num_layers` and a single value would be reused for
+                `num_layers`; Otherwise, the length should be 2 and a single value
+                would be reused twice. If not provided, use 0 as initial states.
+                Default None.
+            sequence_length (Variable, optional): A tensor with shape `[batch_size]`.
+                It stores real length of each instance, thus enables users to extract
+                the last valid state when past a batch element's sequence length for
+                correctness. If not provided, the paddings would be treated same as
+                non-padding inputs. Default None.
+        Returns:
+            tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \
+                is the output of last bidirectional GRU; `final_states` is a \
+                pair including `final_states` of forward and backward GRU when \
+                `merge_each_layer` is False or a list including `final_states` \
+                of all stacked bidirectional GRU, and it has tensors with same \
+                shapes data types as `initial_states`.
+        """
+        if not self.merge_each_layer:
+            return self.gru(inputs, initial_states, sequence_length)
+        else:
+            if isinstance(initial_states, (list, tuple)):
+                assert len(initial_states) == self.num_layers, (
+                    "length of initial_states should be %d when it is a list/tuple"
+                    % self.num_layers)
+            else:
+                initial_states = [initial_states] * self.num_layers
+            stacked_states = []
+            for i in range(self.num_layers):
+                outputs, states = self.gru[i](inputs, initial_states[i],
+                                              sequence_length)
+                inputs = outputs
+                stacked_states.append(states)
+            return outputs, stacked_states
+class DynamicDecode(Layer):
+    """
+    DynamicDecode integrates an Decoder instance to perform dynamic decoding.
+    It performs :code:`decoder.step()` repeatedly until the returned Tensor
+    indicating finished status contains all True values or the number of
+    decoding step reaches to :attr:`max_step_num`.
+    :code:`decoder.initialize()` would be called once before the decoding loop.
+    If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()`
+    would be called once after the decoding loop.
+    Parameters:
+        decoder (Decoder): An instance of `Decoder`.
+        max_step_num (int, optional): The maximum number of steps. If not provided,
+            decode until the decoder is fully done, or in other words, the returned
+            Tensor by :code:`decoder.step()` indicating finished status contains
+            all True. Default `None`.
+        output_time_major (bool, optional): Indicate the data layout of Tensor included
+            in the final outputs(the first returned value of this method). If
+            attr:`False`, the data layout would be batch major with shape
+            `[batch_size, seq_len, ...]`.  If attr:`True`, the data layout would
+            be time major with shape `[seq_len, batch_size, ...]`. Default: `False`.
+        impute_finished (bool, optional): If `True`, then states get copied through
+            for batch entries which are marked as finished, which differs with the
+            unfinished using the new states returned by :code:`decoder.step()` and
+            ensures that the final states have the correct values. Otherwise, states
+            wouldn't be copied through when finished. If the returned `final_states`
+            is needed, it should be set as True, which causes some slowdown.
+            Default `False`.
+        is_test (bool, optional): A flag indicating whether to use test mode. In
+            test mode, it is more memory saving. Default `False`.
+        return_length (bool, optional):  A flag indicating whether to return an
+            extra Tensor variable in the output tuple, which stores the actual
+            lengths of all decoded sequences. Default `False`.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.fluid.layers import BeamSearchDecoder
+            from paddle.incubate.hapi.text import StackedLSTMCell, DynamicDecode
+            paddle.enable_dygraph()
+            vocab_size, d_model, = 100, 32
+            encoder_output = paddle.rand((2, 4, d_model))
+            trg_embeder = fluid.dygraph.Embedding(size=[vocab_size, d_model])
+            output_layer = fluid.dygraph.Linear(d_model, vocab_size)
+            cell = StackedLSTMCell(input_size=d_model, hidden_size=d_model)
+            decoder = BeamSearchDecoder(cell,
+                                        start_token=0,
+                                        end_token=1,
+                                        beam_size=4,
+                                        embedding_fn=trg_embeder,
+                                        output_fn=output_layer)
+            dynamic_decoder = DynamicDecode(decoder, max_step_num=10)
+            outputs = dynamic_decoder(cell.get_initial_states(encoder_output))
+    """
+    def __init__(self,
+                 decoder,
+                 max_step_num=None,
+                 output_time_major=False,
+                 impute_finished=False,
+                 is_test=False,
+                 return_length=False):
+        super(DynamicDecode, self).__init__()
+        self.decoder = decoder
+        self.max_step_num = max_step_num
+        self.output_time_major = output_time_major
+        self.impute_finished = impute_finished
+        self.is_test = is_test
+        self.return_length = return_length
+    def forward(self, inits=None, **kwargs):
+        """
+        Performs :code:`decoder.step()` repeatedly until the returned Tensor
+        indicating finished status contains all True values or the number of
+        decoding step reaches to :attr:`max_step_num`.
+        :code:`decoder.initialize()` would be called once before the decoding loop.
+        If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()`
+        would be called once after the decoding loop.
+        Parameters:
+            inits (object, optional): Argument passed to `decoder.initialize`.
+                Default `None`.
+            **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`.
+        Returns:
+            tuple: A tuple( :code:`(final_outputs, final_states, sequence_lengths)` ) \
+                when `return_length` is True, otherwise a tuple( :code:`(final_outputs, final_states)` ). \
+                The final outputs and states, both are Tensor or nested structure of Tensor. \
+                `final_outputs` has the same structure and data types as the :code:`outputs` \
+                returned by :code:`decoder.step()` , and each Tenser in `final_outputs` \
+                is the stacked of all decoding steps' outputs, which might be revised \
+                by :code:`decoder.finalize()` if the decoder has implemented `finalize`. \
+                `final_states` is the counterpart at last time step of initial states \
+                returned by :code:`decoder.initialize()` , thus has the same structure \
+                with it and has tensors with same shapes and data types. `sequence_lengths` \
+                is an `int64` tensor with the same shape as `finished` returned \
+                by :code:`decoder.initialize()` , and it stores the actual lengths of \
+                all decoded sequences.
+        """
+        if fluid.in_dygraph_mode():
+            class ArrayWrapper(object):
+                def __init__(self, x):
+                    self.array = [x]
+                def append(self, x):
+                    self.array.append(x)
+                    return self
+                def __getitem__(self, item):
+                    return self.array.__getitem__(item)
+            def _maybe_copy(state, new_state, step_mask):
+                # TODO: use where_op
+                state_dtype = state.dtype
+                if convert_dtype(state_dtype) in ["bool"]:
+                    state = layers.cast(state, dtype="float32")
+                    new_state = layers.cast(new_state, dtype="float32")
+                if step_mask.dtype != state.dtype:
+                    step_mask = layers.cast(step_mask, dtype=state.dtype)
+                    # otherwise, renamed bool gradients of would be summed up leading
+                    # to sum(bool) error.
+                    step_mask.stop_gradient = True
+                new_state = layers.elementwise_mul(
+                    state, step_mask, axis=0) - layers.elementwise_mul(
+                        new_state, (step_mask - 1), axis=0)
+                if convert_dtype(state_dtype) in ["bool"]:
+                    new_state = layers.cast(new_state, dtype=state_dtype)
+                return new_state
+            initial_inputs, initial_states, initial_finished = self.decoder.initialize(
+                inits)
+            inputs, states, finished = (initial_inputs, initial_states,
+                                        initial_finished)
+            cond = layers.logical_not((layers.reduce_all(initial_finished)))
+            sequence_lengths = layers.cast(
+                layers.zeros_like(initial_finished), "int64")
+            outputs = None
+            step_idx = 0
+            step_idx_tensor = layers.fill_constant(
+                shape=[1], dtype="int64", value=step_idx)
+            while cond.numpy():
+                (step_outputs, next_states, next_inputs,
+                 next_finished) = self.decoder.step(step_idx_tensor, inputs,
+                                                    states, **kwargs)
+                if not self.decoder.tracks_own_finished:
+                    # BeamSearchDecoder would track it own finished, since
+                    # beams would be reordered and the finished status of each
+                    # entry might change. Otherwise, perform logical OR which
+                    # would not change the already finished.
+                    next_finished = layers.logical_or(next_finished, finished)
+                    # To confirm states.finished/finished be consistent with
+                    # next_finished.
+                    layers.assign(next_finished, finished)
+                next_sequence_lengths = layers.elementwise_add(
+                    sequence_lengths,
+                    layers.cast(
+                        layers.logical_not(finished), sequence_lengths.dtype))
+                if self.impute_finished:  # rectify the states for the finished.
+                    next_states = map_structure(
+                        lambda x, y: _maybe_copy(x, y, finished), states,
+                        next_states)
+                outputs = map_structure(
+                    lambda x: ArrayWrapper(x),
+                    step_outputs) if step_idx == 0 else map_structure(
+                        lambda x, x_array: x_array.append(x), step_outputs,
+                        outputs)
+                inputs, states, finished, sequence_lengths = (
+                    next_inputs, next_states, next_finished,
+                    next_sequence_lengths)
+                layers.increment(x=step_idx_tensor, value=1.0, in_place=True)
+                step_idx += 1
+                layers.logical_not(layers.reduce_all(finished), cond)
+                if self.max_step_num is not None and step_idx > self.max_step_num:
+                    break
+            final_outputs = map_structure(
+                lambda x: fluid.layers.stack(x.array, axis=0), outputs)
+            final_states = states
+            try:
+                final_outputs, final_states = self.decoder.finalize(
+                    final_outputs, final_states, sequence_lengths)
+            except NotImplementedError:
+                pass
+            if not self.output_time_major:
+                final_outputs = map_structure(
+                    lambda x: layers.transpose(x, [1, 0] + list(
+                        range(2, len(x.shape)))), final_outputs)
+            return (final_outputs, final_states,
+                    sequence_lengths) if self.return_length else (final_outputs,
+                                                                  final_states)
+        else:
+            return fluid.layers.dynamic_decode(
+                self.decoder,
+                inits,
+                max_step_num=self.max_step_num,
+                output_time_major=self.output_time_major,
+                impute_finished=self.impute_finished,
+                is_test=self.is_test,
+                return_length=self.return_length,
+                **kwargs)
+class Conv1dPoolLayer(Layer):
+    """
+    This interface is used to construct a callable object of the ``Conv1DPoolLayer``
+    class. The ``Conv1DPoolLayer`` class does a ``Conv1D`` and a ``Pool1D`` .
+    For more details, refer to code examples.The ``Conv1DPoolLayer`` layer calculates
+    the output based on the input, filter and strides, paddings, dilations, groups,
+    global_pooling, pool_type, ceil_mode, exclusive parameters.
+    Parameters:
+        num_channels (int): The number of channels in the input data.
+        num_filters(int): The number of filters. It is the same as the output channels.
+        filter_size (int): The filter size of Conv1DPoolLayer.       
+        pool_size (int): The pooling size of Conv1DPoolLayer.
+        conv_stride (int): The stride size of the conv Layer in Conv1DPoolLayer.
+            Default: 1
+        pool_stride (int): The stride size of the pool layer in Conv1DPoolLayer.
+            Default: 1
+        conv_padding (int): The padding size of the conv Layer in Conv1DPoolLayer.
+            Default: 0
+        pool_padding (int): The padding of pool layer in Conv1DPoolLayer.
+            Default: 0
+        act (str): Activation type for conv layer, if it is set to None, activation
+            is not appended. Default: None.
+        pool_type (str): Pooling type can be `max` for max-pooling or `avg` for
+            average-pooling. Default: `max`
+        dilation (int): The dilation size of the conv Layer. Default: 1.
+        groups (int): The groups number of the conv Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the
+            first half of the filters is only connected to the first half of the
+            input channels, while the second half of the filters is only connected
+            to the second half of the input channels. Default: 1.
+        global_pooling (bool): Whether to use the global pooling. If it is true, 
+                `pool_size` and `pool_padding` would be ignored. Default: False
+        ceil_mode (bool, optional): Whether to use the ceil function to calculate output 
+                height and width.False is the default. If it is set to False, the floor function 
+                will be used. Default: False.
+        exclusive (bool, optional): Whether to exclude padding points in average pooling mode. 
+                Default: True.
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: False
+        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
+            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
+    Example:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import Conv1dPoolLayer
+            # input: [batch_size, num_channels, sequence_length]
+            input = paddle.rand((2, 32, 4))
+            cov2d = Conv1dPoolLayer(num_channels=32,
+                                    num_filters=64,
+                                    filter_size=2,
+                                    pool_size=2)
+            output = cov2d(input)
+    """
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 pool_size,
+                 conv_stride=1,
+                 pool_stride=1,
+                 conv_padding=0,
+                 pool_padding=0,
+                 act=None,
+                 pool_type='max',
+                 global_pooling=False,
+                 dilation=1,
+                 groups=None,
+                 ceil_mode=False,
+                 exclusive=True,
+                 use_cudnn=False,
+                 param_attr=None,
+                 bias_attr=None):
+        super(Conv1dPoolLayer, self).__init__()
+        self._conv2d = Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=[filter_size, 1],
+            stride=[conv_stride, 1],
+            padding=[conv_padding, 0],
+            dilation=[dilation, 1],
+            groups=groups,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            use_cudnn=use_cudnn,
+            act=act)
+        self._pool2d = Pool2D(
+            pool_size=[pool_size, 1],
+            pool_type=pool_type,
+            pool_stride=[pool_stride, 1],
+            pool_padding=[pool_padding, 0],
+            global_pooling=global_pooling,
+            use_cudnn=use_cudnn,
+            ceil_mode=ceil_mode,
+            exclusive=exclusive)
+    def forward(self, input):
+        """
+        Performs conv1d and pool1d on the input.
+        Parameters:
+            input (Variable): A 3-D Tensor, shape is [N, C, H] where N, C and H
+                representing `batch_size`, `num_channels` and `sequence_length`
+                separately. data type can be float32 or float64
+        Returns:
+            Variable: The 3-D output tensor after conv and pool. It has the same \
+                data type as input.
+        """
+        x = fluid.layers.unsqueeze(input, axes=[-1])
+        x = self._conv2d(x)
+        x = self._pool2d(x)
+        x = fluid.layers.squeeze(x, axes=[-1])
+        return x
+class CNNEncoder(Layer):
+    """
+    This interface is used to construct a callable object of the ``CNNEncoder``
+    class. The ``CNNEncoder`` is composed of multiple ``Conv1dPoolLayer`` .
+    ``CNNEncoder`` can define every Conv1dPoolLayer with different or same parameters.
+    The ``Conv1dPoolLayer`` in ``CNNEncoder`` is parallel. The results of every 
+    ``Conv1dPoolLayer`` will concat at the channel dimension as the final output.
+    Parameters:
+        num_channels(int|list|tuple): The number of channels in the input data. If
+            `num_channels` is a list or tuple, the length of `num_channels` must
+            equal to `num_layers`. If `num_channels` is a int, all conv1dpoollayer's
+            `num_channels` are the value of `num_channels`. 
+        num_filters(int|list|tuple): The number of filters. It is the same as the
+            output channels. If `num_filters` is a list or tuple, the length of
+            `num_filters` must equal `num_layers`. If `num_filters` is a int,
+            all conv1dpoollayer's `num_filters` are the value of `num_filters`.
+        filter_size(int|list|tuple): The filter size of Conv1DPoolLayer in CNNEncoder.
+            If `filter_size` is a list or tuple, the length of `filter_size` must
+            equal `num_layers`. If `filter_size` is a int, all conv1dpoollayer's
+            `filter_size` are the value of `filter_size`. 
+        pool_size(int|list|tuple): The pooling size of Conv1DPoolLayer in CNNEncoder.
+            If `pool_size` is a list or tuple, the length of `pool_size` must equal
+            `num_layers`. If `pool_size` is a int, all conv1dpoollayer's `pool_size`
+            are the value of `pool_size`.
+        num_layers(int): The number of conv1dpoolLayer used in CNNEncoder.
+        conv_stride(int|list|tuple): The stride size of the conv Layer in Conv1DPoolLayer.
+            If `conv_stride` is a list or tuple, the length of `conv_stride` must
+            equal `num_layers`. If conv_stride is a int, all conv1dpoollayer's `conv_stride`
+            are the value of `conv_stride`. Default: 1
+        pool_stride(int|list|tuple): The stride size of the pool layer in Conv1DPoolLayer.
+            If `pool_stride` is a list or tuple, the length of `pool_stride` must
+            equal `num_layers`. If `pool_stride` is a int, all conv1dpoollayer's `pool_stride`
+            are the value of `pool_stride`. Default: 1
+        conv_padding(int|list|tuple): The padding size of the conv Layer in Conv1DPoolLayer.
+            If `conv_padding` is a list or tuple, the length of `conv_padding` must
+            equal `num_layers`. If `conv_padding` is a int, all conv1dpoollayer's `conv_padding`
+            are the value of `conv_padding`. Default: 0
+        pool_padding(int|list|tuple): The padding size of pool layer in Conv1DPoolLayer.
+            If `pool_padding` is a list or tuple, the length of `pool_padding` must
+            equal `num_layers`.If `pool_padding` is a int, all conv1dpoollayer's `pool_padding`
+            are the value of `pool_padding`. Default: 0
+        act (str|list|tuple): Activation type for `Conv1dPoollayer` layer, if it is set to None,
+            activation is not appended. Default: None.
+        pool_type (str): Pooling type can be `max` for max-pooling or `avg` for
+            average-pooling. Default: `max`
+        global_pooling (bool): Whether to use the global pooling. If it is true, 
+            `pool_size` and `pool_padding` would be ignored. Default: False
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: False
+    Example:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import CNNEncoder
+            # input: [batch_size, num_channels, sequence_length]
+            input = paddle.rand((2, 32, 8))
+            cov_encoder = CNNEncoder(num_layers=2,
+                                     num_channels=32,
+                                     num_filters=64,
+                                     filter_size=[2, 3],
+                                     pool_size=[7, 6])
+            output = cov_encoder(input)  # [2, 128, 1]
+    """
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 pool_size,
+                 num_layers=1,
+                 conv_stride=1,
+                 pool_stride=1,
+                 conv_padding=0,
+                 pool_padding=0,
+                 act=None,
+                 pool_type='max',
+                 global_pooling=False,
+                 use_cudnn=False):
+        super(CNNEncoder, self).__init__()
+        self.num_layers = num_layers
+        self.num_channels = num_channels
+        self.num_filters = num_filters
+        self.filter_size = filter_size
+        self.pool_size = pool_size
+        self.conv_stride = conv_stride
+        self.pool_stride = pool_stride
+        self.conv_padding = conv_padding
+        self.pool_padding = pool_padding
+        self.use_cudnn = use_cudnn
+        self.act = act
+        self.pool_type = pool_type
+        self.global_pooling = global_pooling
+        self.conv1d_pool_layers = fluid.dygraph.LayerList([
+            Conv1dPoolLayer(
+                num_channels=self.num_channels
+                if isinstance(self.num_channels, int) else self.num_channels[i],
+                num_filters=self.num_filters
+                if isinstance(self.num_channels, int) else self.num_filters[i],
+                filter_size=self.filter_size
+                if isinstance(self.filter_size, int) else self.filter_size[i],
+                pool_size=self.pool_size
+                if isinstance(self.pool_size, int) else self.pool_size[i],
+                conv_stride=self.conv_stride
+                if isinstance(self.conv_stride, int) else self.conv_stride[i],
+                pool_stride=self.pool_stride
+                if isinstance(self.pool_stride, int) else self.pool_stride[i],
+                conv_padding=self.conv_padding
+                if isinstance(self.conv_padding, int) else self.conv_padding[i],
+                pool_padding=self.pool_padding
+                if isinstance(self.pool_padding, int) else self.pool_padding[i],
+                act=self.act[i]
+                if isinstance(self.act, (list, tuple)) else self.act,
+                pool_type=self.pool_type,
+                global_pooling=self.global_pooling,
+                use_cudnn=self.use_cudnn) for i in range(num_layers)
+        ])
+    def forward(self, input):
+        """
+        Performs multiple parallel conv1d and pool1d, and concat the results of
+        them at the channel dimension to produce the final output.
+        Parameters:
+            input (Variable): A 3-D Tensor, shape is [N, C, H] where N, C and H
+                representing `batch_size`, `num_channels` and `sequence_length`
+                separately. data type can be float32 or float64
+        Returns:
+            Variable: The 3-D output tensor produced by concatenating results of \
+                all Conv1dPoolLayer. It has the same data type as input.
+        """
+        res = [
+            conv1d_pool_layer(input)
+            for conv1d_pool_layer in self.conv1d_pool_layers
+        ]
+        out = fluid.layers.concat(input=res, axis=1)
+        return out
+class TransformerCell(RNNCell):
+    """
+    TransformerCell wraps a Transformer decoder producing logits from `inputs`
+    composed by ids and position.
+    Parameters:
+        decoder(callable): A TransformerDecoder instance. Or a wrapper of it that
+            includes a embedding layer accepting ids and positions instead of embeddings
+            and includes a output layer transforming decoder output features to logits.
+        embedding_fn(function, optional): A callable that accepts ids and position
+            as arguments and return embeddings as input of `decoder`. It can be
+            None if `decoder` includes a embedding layer. Default None.
+        output_fn(callable, optional): A callable applid on `decoder` output to
+            transform decoder output features to get logits. Mostly it is a Linear
+            layer with vocabulary size. It can be None if `decoder` includes a
+            output layer. Default None.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.fluid.dygraph import Embedding, Linear
+            from paddle.incubate.hapi.text import TransformerDecoder
+            from paddle.incubate.hapi.text import TransformerCell
+            from paddle.incubate.hapi.text import TransformerBeamSearchDecoder
+            from paddle.incubate.hapi.text import DynamicDecode
+            paddle.enable_dygraph()
+            class Embedder(fluid.dygraph.Layer):
+                def __init__(self):
+                    super(Embedder, self).__init__()
+                    self.word_embedder = Embedding(size=[1000, 128])
+                    self.pos_embedder = Embedding(size=[500, 128])
+                def forward(self, word, position):
+                    return self.word_embedder(word) + self.pos_embedder(position)
+            embedder = Embedder()
+            output_layer = Linear(128, 1000)
+            decoder = TransformerDecoder(2, 2, 64, 64, 128, 512)
+            transformer_cell = TransformerCell(decoder, embedder, output_layer)
+            dynamic_decoder = DynamicDecode(
+                TransformerBeamSearchDecoder(
+                    transformer_cell,
+                    start_token=0,
+                    end_token=1,
+                    beam_size=4,
+                    var_dim_in_state=2),
+                max_step_num=10,
+                is_test=True)
+            enc_output = paddle.rand((2, 4, 128))
+            # cross attention bias: [batch_size, n_head, trg_len, src_len]
+            trg_src_attn_bias = paddle.rand((2, 2, 1, 4))
+            # inputs for beam search on Transformer
+            caches = transformer_cell.get_initial_states(enc_output)
+            enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
+                enc_output, beam_size=4)
+            trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
+                trg_src_attn_bias, beam_size=4)
+            static_caches = decoder.prepare_static_cache(enc_output)
+            outputs = dynamic_decoder(
+                inits=caches,
+                enc_output=enc_output,
+                trg_src_attn_bias=trg_src_attn_bias,
+                static_caches=static_caches)
+    """
+    def __init__(self, decoder, embedding_fn=None, output_fn=None):
+        super(TransformerCell, self).__init__()
+        self.decoder = decoder
+        self.embedding_fn = embedding_fn
+        self.output_fn = output_fn
+    def forward(self,
+                inputs,
+                states=None,
+                enc_output=None,
+                trg_slf_attn_bias=None,
+                trg_src_attn_bias=None,
+                static_caches=[]):
+        """
+        Produces logits from `inputs` composed by ids and positions.
+        Parameters:
+            inputs(tuple): A tuple includes target ids and positions. The two
+                tensors both have int64 data type and with 2D shape 
+                `[batch_size, sequence_length]` where `sequence_length` is 1
+                for inference.
+            states(list): It caches the multi-head attention intermediate results
+                of history decoding steps. It is a list of dict where the length
+                of list is decoder layer number, and each dict has `k` and `v` as
+                keys and values are cached results. Default None
+            enc_output(Variable): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, sequence_length, d_model]`. The data type
+                should be float32 or float64.
+            trg_slf_attn_bias(Variable, optional): A tensor used in decoder self
+                attention to mask out attention on unwanted target positions. It
+                is a tensor with shape `[batch_size, n_head, target_length, target_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. It can be None when nothing wanted or needed to
+                be masked out. It can be None for inference. The data type should
+                be float32 or float64. Default None
+            trg_src_attn_bias(Variable, optional): A tensor used in decoder-encoder
+                cross attention to mask out unwanted attention on source (encoder output).
+                It is a tensor with shape `[batch_size, n_head, target_length, source_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. It can be None when nothing wanted or needed to
+                be masked out. The data type should be float32 or float64. Default None
+            static_caches(list): It stores projected results of encoder output
+                to be used as keys and values in decoder-encoder cross attention
+                It is a list of dict where the length of list is decoder layer
+                number, and each dict has `static_k` and `static_v` as keys and
+                values are stored results. Default empty list
+        Returns:
+            tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` \
+                is a float32 or float64 3D tensor representing logits shaped \
+                `[batch_size, sequence_length, vocab_size]`. `new_states has \
+                the same structure and data type with `states` while the length \
+                is one larger since the intermediate results of current step are \
+                concatenated into it.
+        """
+        trg_word, trg_pos = inputs
+        if states and static_caches:
+            for cache, static_cache in zip(states, static_caches):
+                cache.update(static_cache)
+        if self.embedding_fn is not None:
+            dec_input = self.embedding_fn(trg_word, trg_pos)
+            outputs = self.decoder(dec_input, enc_output, None,
+                                   trg_src_attn_bias, states)
+        else:
+            outputs = self.decoder(trg_word, trg_pos, enc_output, None,
+                                   trg_src_attn_bias, states)
+        if self.output_fn is not None:
+            outputs = self.output_fn(outputs)
+        new_states = [{
+            "k": cache["k"],
+            "v": cache["v"]
+        } for cache in states] if states else states
+        return outputs, new_states
+    @property
+    def state_shape(self):
+        """
+        States of TransformerCell cache the multi-head attention intermediate
+        results of history decoding steps, and have a increasing length as
+        decoding continued.
+        `state_shape` of TransformerCell is used to initialize states. It is a
+        list of dict where the length of list is decoder layer, and each dict
+        has `k` and `v` as keys and values are `[n_head, 0, d_key]`, `[n_head, 0, d_value]`
+        separately. (-1 for batch size would be automatically inserted into shape).
+        Returns:
+            list: It is a list of dict where the length of list is decoder layer \
+                number, and each dict has `k` and `v` as keys and values are cached \
+                results.
+        """
+        return [{
+            "k": [self.decoder.n_head, 0, self.decoder.d_key],
+            "v": [self.decoder.n_head, 0, self.decoder.d_value],
+        } for i in range(self.decoder.n_layer)]
+class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):
+    """
+    Compared with a RNN step :code:`outputs, new_states = cell(inputs, states)`,
+    Transformer decoder's `inputs` uses 2D tensor shaped `[batch_size * beam_size, 1]`
+    and includes extra position data. And its `states` (caches) has increasing
+    length. These are not consistent with `BeamSearchDecoder`, thus subclass
+    `BeamSearchDecoder` to make beam search adapt to Transformer decoder.
+    Parameters:
+        cell(TransformerCell): An instance of `TransformerCell`.
+        start_token(int): The start token id.
+        end_token(int): The end token id.
+        beam_size(int): The beam width used in beam search.
+        var_dim_in_state(int): Indicate which dimension of states is variant.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.fluid.dygraph import Embedding, Linear
+            from paddle.incubate.hapi.text import TransformerDecoder
+            from paddle.incubate.hapi.text import TransformerCell
+            from paddle.incubate.hapi.text import TransformerBeamSearchDecoder
+            from paddle.incubate.hapi.text import DynamicDecode
+            paddle.enable_dygraph()
+            class Embedder(fluid.dygraph.Layer):
+                def __init__(self):
+                    super(Embedder, self).__init__()
+                    self.word_embedder = Embedding(size=[1000, 128])
+                    self.pos_embedder = Embedding(size=[500, 128])
+                def forward(self, word, position):
+                    return self.word_embedder(word) + self.pos_embedder(position)
+            embedder = Embedder()
+            output_layer = Linear(128, 1000)
+            decoder = TransformerDecoder(2, 2, 64, 64, 128, 512)
+            transformer_cell = TransformerCell(decoder, embedder, output_layer)
+            dynamic_decoder = DynamicDecode(
+                TransformerBeamSearchDecoder(
+                    transformer_cell,
+                    start_token=0,
+                    end_token=1,
+                    beam_size=4,
+                    var_dim_in_state=2),
+                max_step_num=10,
+                is_test=True)
+            enc_output = paddle.rand((2, 4, 128))
+            # cross attention bias: [batch_size, n_head, trg_len, src_len]
+            trg_src_attn_bias = paddle.rand((2, 2, 1, 4))
+            # inputs for beam search on Transformer
+            caches = transformer_cell.get_initial_states(enc_output)
+            enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
+                enc_output, beam_size=4)
+            trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
+                trg_src_attn_bias, beam_size=4)
+            static_caches = decoder.prepare_static_cache(enc_output)
+            outputs = dynamic_decoder(
+                inits=caches,
+                enc_output=enc_output,
+                trg_src_attn_bias=trg_src_attn_bias,
+                static_caches=static_caches)
+    """
+    def __init__(self, cell, start_token, end_token, beam_size,
+                 var_dim_in_state):
+        super(TransformerBeamSearchDecoder,
+              self).__init__(cell, start_token, end_token, beam_size)
+        self.cell = cell
+        self.var_dim_in_state = var_dim_in_state
+    def _merge_batch_beams_with_var_dim(self, x):
+        """
+        Reshape a tensor with shape `[batch_size, beam_size, ...]` to a new
+        tensor with shape `[batch_size * beam_size, ...]`. 
+        Parameters:
+            x(Variable): A tensor with shape `[batch_size, beam_size, ...]`. The
+                data type should be float32, float64, int32, int64 or bool.
+        Returns:
+            Variable: A tensor with shape `[batch_size * beam_size, ...]`, whose \
+                data type is same as `x`.
+        """
+        # init length of cache is 0, and it increases with decoding carrying on,
+        # thus need to reshape elaborately
+        var_dim_in_state = self.var_dim_in_state + 1  # count in beam dim
+        x = layers.transpose(x,
+                             list(range(var_dim_in_state, len(x.shape))) +
+                             list(range(0, var_dim_in_state)))
+        x = layers.reshape(
+            x, [0] * (len(x.shape) - var_dim_in_state
+                      ) + [self.batch_size * self.beam_size] +
+            [int(size) for size in x.shape[-var_dim_in_state + 2:]])
+        x = layers.transpose(
+            x,
+            list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) +
+            list(range(0, (len(x.shape) + 1 - var_dim_in_state))))
+        return x
+    def _split_batch_beams_with_var_dim(self, x):
+        """
+        Reshape a tensor with shape `[batch_size * beam_size, ...]` to a new
+        tensor with shape `[batch_size, beam_size, ...]`. 
+        Parameters:
+            x(Variable): A tensor with shape `[batch_size * beam_size, ...]`. The
+                data type should be float32, float64, int32, int64 or bool.
+        Returns:
+            Variable: A tensor with shape `[batch_size, beam_size, ...]`, whose \
+                data type is same as `x`.     
+        """
+        var_dim_size = layers.shape(x)[self.var_dim_in_state]
+        x = layers.reshape(
+            x, [-1, self.beam_size] +
+            [int(size)
+             for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] +
+            [int(size) for size in x.shape[self.var_dim_in_state + 1:]])
+        return x
+    def step(self, time, inputs, states, **kwargs):
+        """
+        Perform a beam search decoding step, which uses `cell` to get probabilities,
+        and follows a beam search step to calculate scores and select candidate
+        token ids.
+        Note: compared with `BeamSearchDecoder.step`, it feed 2D id tensor shaped
+        `[batch_size * beam_size, 1]` rather than `[batch_size * beam_size]` combined
+        position data as inputs to `cell`.
+        Parameters:
+            time(Variable): An `int64` tensor with shape `[1]` provided by the caller,
+                representing the current time step number of decoding.
+            inputs(Variable): A tensor variable. It is same as `initial_inputs`
+                returned by `initialize()` for the first decoding step and
+                `next_inputs` returned by `step()` for the others. It is a int64
+                id tensor with shape `[batch_size * beam_size]`
+            states(Variable): A structure of tensor variables.
+                It is same as the `initial_states` returned by `initialize()` for
+                the first decoding step and `beam_search_state` returned by
+                `step()` for the others.
+            **kwargs: Additional keyword arguments, provided by the caller. 
+        Returns:
+            tuple: A tuple( :code:`(beam_search_output, beam_search_state, next_inputs, finished)` ). \
+                `beam_search_state` and `next_inputs` have the same structure, \
+                shape and data type as the input arguments `states` and `inputs` separately. \
+                `beam_search_output` is a namedtuple(including scores, predicted_ids, \
+                parent_ids as fields) of tensor variables, where \
+                `scores, predicted_ids, parent_ids` all has a tensor value shaped \
+                `[batch_size, beam_size]` with data type `float32, int64, int64`. \
+                `finished` is a `bool` tensor with shape `[batch_size, beam_size]`.
+        """
+        # compared to RNN, Transformer has 3D data at every decoding step
+        inputs = layers.reshape(inputs, [-1, 1])  # token
+        pos = layers.ones_like(inputs) * time  # pos
+        cell_states = map_structure(self._merge_batch_beams_with_var_dim,
+                                    states.cell_states)
+        cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states,
+                                                   **kwargs)
+        # squeeze to adapt to BeamSearchDecoder which use 2D logits
+        cell_outputs = map_structure(
+            lambda x: layers.squeeze(x, [1]) if len(x.shape) == 3 else x,
+            cell_outputs)
+        cell_outputs = map_structure(self._split_batch_beams, cell_outputs)
+        next_cell_states = map_structure(self._split_batch_beams_with_var_dim,
+                                         next_cell_states)
+        beam_search_output, beam_search_state = self._beam_search_step(
+            time=time,
+            logits=cell_outputs,
+            next_cell_states=next_cell_states,
+            beam_state=states)
+        next_inputs, finished = (beam_search_output.predicted_ids,
+                                 beam_search_state.finished)
+        return (beam_search_output, beam_search_state, next_inputs, finished)
+### Transformer Modules ###
+class PrePostProcessLayer(Layer):
+    """
+    PrePostProcessLayer is used before/after each multi-head attention(MHA) and
+    feed-forward network(FFN) sub-layer to perform some specific process on
+    inputs/outputs.
+    Parameters:
+        process_cmd (str): The process applied before/after each MHA and
+            FFN sub-layer. It should be a string composed of `d`, `a`, `n`,
+            where `d` for dropout, `a` for add residual connection, `n` for
+            layer normalization.
+        d_model (int): The expected feature size in the input and output.
+        dropout_rate (float): The dropout probability if the process includes
+            dropout. Default 0.1
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import PrePostProcessLayer
+            # input: [batch_size, sequence_length, d_model]
+            x = paddle.rand((2, 4, 32))
+            process = PrePostProcessLayer('n', 32)
+            out = process(x)  # [2, 4, 32]
+    """
+    def __init__(self, process_cmd, d_model, dropout_rate=0.1):
+        super(PrePostProcessLayer, self).__init__()
+        self.process_cmd = process_cmd
+        self.functors = []
+        for cmd in self.process_cmd:
+            if cmd == "a":  # add residual connection
+                self.functors.append(lambda x, y: x + y if y is not None else x)
+            elif cmd == "n":  # add layer normalization
+                layer_norm = LayerNorm(
+                    normalized_shape=d_model,
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Constant(1.)),
+                    bias_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Constant(0.)))
+                self.functors.append(
+                    self.add_sublayer(
+                        "layer_norm_%d" % len(
+                            self.sublayers(include_sublayers=False)),
+                        layer_norm))
+            elif cmd == "d":  # add dropout
+                self.functors.append(lambda x: layers.dropout(
+                    x, dropout_prob=dropout_rate, is_test=False)
+                                     if dropout_rate else x)
+    def forward(self, x, residual=None):
+        """
+        Applies `process_cmd` specified process on `x`.
+        Parameters:
+            x (Variable): The tensor to be processed. The data type should be float32
+                or float64. The shape is `[batch_size, sequence_length, d_model]`.
+            residual (Variable, optional): Only used if the process includes
+                residual connection. It has the same shape and data type as `x`.
+                Default None
+        Returns:
+            Variable: The processed tensor. It has the same shape and data type \
+                    as `x`.
+        """
+        for i, cmd in enumerate(self.process_cmd):
+            if cmd == "a":
+                x = self.functors[i](x, residual)
+            else:
+                x = self.functors[i](x)
+        return x
+class MultiHeadAttention(Layer):
+    """
+    MultiHead Attention mapps queries and a set of key-value pairs to outputs
+    by jointly attending to information from different representation subspaces,
+    as what multi-head indicates it performs multiple attention in parallel.
+    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
+    for more details.
+    Parameters:
+        d_key (int): The feature size to transformer queries and keys as in
+            multi-head attention. Mostly it equals to `d_model // n_head`.
+        d_value (int): The feature size to transformer values as in multi-head
+            attention. Mostly it equals to `d_model // n_head`.
+        d_model (int): The expected feature size in the input and output.
+        n_head (int): The number of heads in multi-head attention(MHA).
+        dropout_rate (float, optional): The dropout probability used in MHA to
+            drop some attention target. Default 0.1
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import MultiHeadAttention
+            # encoder input: [batch_size, sequence_length, d_model]
+            query = paddle.rand((2, 4, 128))
+            # self attention bias: [batch_size, n_head, src_len, src_len]
+            attn_bias = paddle.rand((2, 2, 4, 4))
+            multi_head_attn = MultiHeadAttention(64, 64, 128, n_head=2)
+            output = multi_head_attn(query, attn_bias=attn_bias)  # [2, 4, 128]
+    """
+    def __init__(self, d_key, d_value, d_model, n_head, dropout_rate=0.1):
+        super(MultiHeadAttention, self).__init__()
+        self.n_head = n_head
+        self.d_key = d_key
+        self.d_value = d_value
+        self.d_model = d_model
+        self.dropout_rate = dropout_rate
+        self.q_fc = Linear(
+            input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
+        self.k_fc = Linear(
+            input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
+        self.v_fc = Linear(
+            input_dim=d_model, output_dim=d_value * n_head, bias_attr=False)
+        self.proj_fc = Linear(
+            input_dim=d_value * n_head, output_dim=d_model, bias_attr=False)
+    def _prepare_qkv(self, queries, keys, values, cache=None):
+        """
+        Prapares linear projected queries, keys and values for usage of subsequnt
+        multiple attention in parallel. If `cache` is not None, using cached
+        results to reduce redundant calculations.
+        Parameters:
+            queries (Variable): The queries for multi-head attention. It is a
+                tensor with shape `[batch_size, sequence_length, d_model]`. The
+                data type should be float32 or float64.
+            keys (Variable, optional): The keys for multi-head attention. It is
+                a tensor with shape `[batch_size, sequence_length, d_model]`. The
+                data type should be float32 or float64.
+            values (Variable, optional): The values for multi-head attention. It
+                is a tensor with shape `[batch_size, sequence_length, d_model]`.
+                The data type should be float32 or float64.
+            cache(dict, optional): It is a dict with `k` and `v` as keys, and
+                values cache the multi-head attention intermediate results of
+                history decoding steps for decoder self attention; Or a dict
+                with `static_k` and `statkc_v` as keys, and values stores intermediate
+                results of encoder output for decoder-encoder cross attention.
+                If it is for decoder self attention, values for `k` and `v` would
+                be updated by new tensors concatanating raw tensors with intermediate
+                results of current step. It is only used for inference and should
+                be None for training. Default None
+        Returns:
+            tuple: A tuple including linear projected keys and values. These two \
+                tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \
+                and `[batch_size, n_head, sequence_length, d_value]` separately, \
+                and their data types are same as inputs.
+        """
+        if keys is None:  # self-attention
+            keys, values = queries, queries
+            static_kv = False
+        else:  # cross-attention
+            static_kv = True
+        q = self.q_fc(queries)
+        q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key])
+        q = layers.transpose(x=q, perm=[0, 2, 1, 3])
+        if cache is not None and static_kv and "static_k" in cache:
+            # for encoder-decoder attention in inference and has cached
+            k = cache["static_k"]
+            v = cache["static_v"]
+        else:
+            k = self.k_fc(keys)
+            v = self.v_fc(values)
+            k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
+            k = layers.transpose(x=k, perm=[0, 2, 1, 3])
+            v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
+            v = layers.transpose(x=v, perm=[0, 2, 1, 3])
+        if cache is not None:
+            if static_kv and not "static_k" in cache:
+                # for encoder-decoder attention in inference and has not cached
+                cache["static_k"], cache["static_v"] = k, v
+            elif not static_kv:
+                # for decoder self-attention in inference
+                cache_k, cache_v = cache["k"], cache["v"]
+                k = layers.concat([cache_k, k], axis=2)
+                v = layers.concat([cache_v, v], axis=2)
+                cache["k"], cache["v"] = k, v
+        return q, k, v
+    def forward(self,
+                queries,
+                keys=None,
+                values=None,
+                attn_bias=None,
+                cache=None):
+        """
+        Applies multi-head attention to map queries and a set of key-value pairs
+        to outputs.
+        Parameters:
+            queries (Variable): The queries for multi-head attention. It is a
+                tensor with shape `[batch_size, sequence_length, d_model]`. The
+                data type should be float32 or float64.
+            keys (Variable, optional): The keys for multi-head attention. It is
+                a tensor with shape `[batch_size, sequence_length, d_model]`. The
+                data type should be float32 or float64.
+            values (Variable, optional): The values for multi-head attention. It
+                is a tensor with shape `[batch_size, sequence_length, d_model]`.
+                The data type should be float32 or float64.
+            attn_bias (Variable, optional): A tensor used in multi-head attention
+                to mask out attention on unwanted positions, usually the
+                paddings or the subsequent positions. It is a tensor with shape
+                `[batch_size, n_head, sequence_length, sequence_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be masked out. Default None
+            cache(dict, optional): It is a dict with `k` and `v` as keys, and
+                values cache the multi-head attention intermediate results of
+                history decoding steps for decoder self attention; Or a dict
+                with `static_k` and `statkc_v` as keys, and values stores intermediate
+                results of encoder output for decoder-encoder cross attention.
+                If it is for decoder self attention, values for `k` and `v` would
+                be updated by new tensors concatanating raw tensors with intermediate
+                results of current step. It is only used for inference and should
+                be None for training. Default None
+        Returns:
+            Variable: The output of multi-head attention. It is a tensor \
+                that has the same shape and data type as `queries`.
+        """
+        # compute q ,k ,v
+        q, k, v = self._prepare_qkv(queries, keys, values, cache)
+        # scale dot product attention
+        product = layers.matmul(
+            x=q, y=k, transpose_y=True, alpha=self.d_key**-0.5)
+        if attn_bias is not None:
+            product += attn_bias
+        weights = layers.softmax(product)
+        if self.dropout_rate:
+            weights = layers.dropout(
+                weights, dropout_prob=self.dropout_rate, is_test=False)
+        out = layers.matmul(weights, v)
+        # combine heads
+        out = layers.transpose(out, perm=[0, 2, 1, 3])
+        out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
+        # project to output
+        out = self.proj_fc(out)
+        return out
+    def cal_kv(self, keys, values):
+        """
+        Applies linear projection on input keys and values, then splits heads
+        (reshape and transpose) to get keys and values from different representation
+        subspaces for usage of subsequnt multiple attention in parallel.
+        Parameters:
+            keys (Variable, optional): The keys for multi-head attention. It is
+                a tensor with shape `[batch_size, sequence_length, d_model]`. The
+                data type should be float32 or float64.
+            values (Variable, optional): The values for multi-head attention. It
+                is a tensor with shape `[batch_size, sequence_length, d_model]`.
+                The data type should be float32 or float64.
+        Returns:
+            tuple: A tuple including linear projected keys and values. These two \
+                tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \
+                and `[batch_size, n_head, sequence_length, d_value]` separately, \
+                and their data types are same as inputs.
+        """
+        k = self.k_fc(keys)
+        v = self.v_fc(values)
+        k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
+        k = layers.transpose(x=k, perm=[0, 2, 1, 3])
+        v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
+        v = layers.transpose(x=v, perm=[0, 2, 1, 3])
+        return k, v
+class FFN(Layer):
+    """
+    A fully connected feed-forward network applied to each position separately
+    and identically. This consists of two linear transformations with a activation
+    and dropout in between.
+    Parameters:
+        d_inner_hid (int): The hidden size in the feedforward network(FFN).
+        d_model (int): The expected feature size in the input and output.
+        dropout_rate (float, optional): The dropout probability used after
+            activition. Default 0.1
+        ffn_fc1_act (str, optional): The activation function in the feedforward
+            network. Default relu.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import FFN
+            # input: [batch_size, sequence_length, d_model]
+            x = paddle.rand((2, 4, 32))
+            ffn = FFN(128, 32)
+            out = ffn(x)  # [2, 4, 32]
+    """
+    def __init__(self, d_inner_hid, d_model, dropout_rate=0.1, fc1_act="relu"):
+        super(FFN, self).__init__()
+        self.dropout_rate = dropout_rate
+        self.fc1 = Linear(
+            input_dim=d_model, output_dim=d_inner_hid, act=fc1_act)
+        self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model)
+    def forward(self, x):
+        """
+        Applies a fully connected feed-forward network on each position  of the
+        input sequences separately and identically.
+        Parameters:
+            x (Variable): The input of feed-forward network. It is a tensor
+                with shape `[batch_size, sequence_length, d_model]`. The data
+                type should be float32 or float64.
+        Returns:
+            Variable: The output of feed-forward network. It is a tensor that has \
+                the same shape and data type as `enc_input`.
+        """
+        hidden = self.fc1(x)
+        if self.dropout_rate:
+            hidden = layers.dropout(
+                hidden, dropout_prob=self.dropout_rate, is_test=False)
+        out = self.fc2(hidden)
+        return out
+class TransformerEncoderLayer(Layer):
+    """
+    TransformerEncoderLayer is composed of two sub-layers which are self (multi-head)
+    attention and feedforward network. Before and after each sub-layer, pre-process
+    and post-precess would be applied on the input and output.
+    Parameters:
+        n_head (int): The number of heads in multi-head attention(MHA).
+        d_key (int): The feature size to transformer queries and keys as in
+            multi-head attention. Mostly it equals to `d_model // n_head`.
+        d_value (int): The feature size to transformer values as in multi-head
+            attention. Mostly it equals to `d_model // n_head`.
+        d_model (int): The expected feature size in the input and output.
+        d_inner_hid (int): The hidden layer size in the feedforward network(FFN).
+        prepostprocess_dropout (float, optional): The dropout probability used
+            in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1
+        attention_dropout (float, optional): The dropout probability used
+            in MHA to drop some attention target. Default 0.1
+        relu_dropout (float, optional): The dropout probability used after FFN
+            activition. Default 0.1
+        preprocess_cmd (str, optional): The process applied before each MHA and
+            FFN sub-layer, and it also would be applied on output of the last
+            stacked layer. It should be a string composed of `d`, `a`, `n`,
+            where `d` for dropout, `a` for add residual connection, `n` for
+            layer normalization. Default `n`.
+        postprocess_cmd (str, optional): The process applied after each MHA and
+            FFN sub-layer. Same as `preprocess_cmd`. It should be a string
+            composed of `d`, `a`, `n`, where `d` for dropout, `a` for add
+            residual connection, `n` for layer normalization. Default `da`.
+        ffn_fc1_act (str, optional): The activation function in the feedforward
+            network. Default relu.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import TransformerEncoderLayer
+            # encoder input: [batch_size, src_len, d_model]
+            enc_input = paddle.rand((2, 4, 128))
+            # self attention bias: [batch_size, n_head, src_len, src_len]
+            attn_bias = paddle.rand((2, 2, 4, 4))
+            encoder_layer = TransformerEncoderLayer(2, 64, 64, 128, 512)
+            enc_output = encoder_layer(enc_input, attn_bias)  # [2, 4, 128]
+    """
+    def __init__(self,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout=0.1,
+                 attention_dropout=0.1,
+                 relu_dropout=0.1,
+                 preprocess_cmd="n",
+                 postprocess_cmd="da",
+                 ffn_fc1_act="relu"):
+        super(TransformerEncoderLayer, self).__init__()
+        self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
+                                                 prepostprocess_dropout)
+        self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
+                                            attention_dropout)
+        self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
+                                                  prepostprocess_dropout)
+        self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
+                                                 prepostprocess_dropout)
+        self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act)
+        self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
+                                                  prepostprocess_dropout)
+    def forward(self, enc_input, attn_bias=None):
+        """
+        Applies a Transformer encoder layer on the input.
+        Parameters:
+            enc_input (Variable): The input of Transformer encoder layer. It is
+                a tensor with shape `[batch_size, sequence_length, d_model]`.
+                The data type should be float32 or float64.
+            attn_bias(Variable, optional): A tensor used in encoder self attention
+                to mask out attention on unwanted positions, usually the paddings. It
+                is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be masked out. Default None
+        Returns:
+            Variable: The output of Transformer encoder layer. It is a tensor that \
+                has the same shape and data type as `enc_input`.
+        """
+        attn_output = self.self_attn(
+            self.preprocesser1(enc_input), None, None, attn_bias)
+        attn_output = self.postprocesser1(attn_output, enc_input)
+        ffn_output = self.ffn(self.preprocesser2(attn_output))
+        ffn_output = self.postprocesser2(ffn_output, attn_output)
+        return ffn_output
+class TransformerEncoder(Layer):
+    """
+    TransformerEncoder is a stack of N encoder layers.
+    Parameters:
+        n_layer (int): The number of encoder layers to be stacked.
+        n_head (int): The number of heads in multi-head attention(MHA).
+        d_key (int): The feature size to transformer queries and keys as in
+            multi-head attention. Mostly it equals to `d_model // n_head`.
+        d_value (int): The feature size to transformer values as in multi-head
+            attention. Mostly it equals to `d_model // n_head`.
+        d_model (int): The expected feature size in the input and output.
+        d_inner_hid (int): The hidden layer size in the feedforward network(FFN).
+        prepostprocess_dropout (float, optional): The dropout probability used
+            in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1
+        attention_dropout (float, optional): The dropout probability used
+            in MHA to drop some attention target. Default 0.1
+        relu_dropout (float, optional): The dropout probability used after FFN
+            activition. Default 0.1
+        preprocess_cmd (str, optional): The process applied before each MHA and
+            FFN sub-layer, and it also would be applied on output of the last
+            stacked layer. It should be a string composed of `d`, `a`, `n`,
+            where `d` for dropout, `a` for add residual connection, `n` for
+            layer normalization. Default `n`.
+        postprocess_cmd (str, optional): The process applied after each MHA and
+            FFN sub-layer. Same as `preprocess_cmd`. It should be a string
+            composed of `d`, `a`, `n`, where `d` for dropout, `a` for add
+            residual connection, `n` for layer normalization. Default `da`.
+        ffn_fc1_act (str, optional): The activation function in the feedforward
+            network. Default relu.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import TransformerEncoder
+            # encoder input: [batch_size, src_len, d_model]
+            enc_input = paddle.rand((2, 4, 128))
+            # self attention bias: [batch_size, n_head, src_len, src_len]
+            attn_bias = paddle.rand((2, 2, 4, 4))
+            encoder = TransformerEncoder(2, 2, 64, 64, 128, 512)
+            enc_output = encoder(enc_input, attn_bias)  # [2, 4, 128]
+    """
+    def __init__(self,
+                 n_layer,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout=0.1,
+                 attention_dropout=0.1,
+                 relu_dropout=0.1,
+                 preprocess_cmd="n",
+                 postprocess_cmd="da",
+                 ffn_fc1_act="relu"):
+        super(TransformerEncoder, self).__init__()
+        self.encoder_layers = list()
+        for i in range(n_layer):
+            self.encoder_layers.append(
+                self.add_sublayer(
+                    "layer_%d" % i,
+                    TransformerEncoderLayer(
+                        n_head,
+                        d_key,
+                        d_value,
+                        d_model,
+                        d_inner_hid,
+                        prepostprocess_dropout,
+                        attention_dropout,
+                        relu_dropout,
+                        preprocess_cmd,
+                        postprocess_cmd,
+                        ffn_fc1_act=ffn_fc1_act)))
+        self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
+                                             prepostprocess_dropout)
+    def forward(self, enc_input, attn_bias=None):
+        """
+        Applies a stack of N Transformer encoder layers on input sequences.
+        Parameters:
+            enc_input (Variable): The input of Transformer encoder. It is a tensor
+                with shape `[batch_size, sequence_length, d_model]`. The data
+                type should be float32 or float64.
+            attn_bias(Variable, optional): A tensor used in encoder self attention
+                to mask out attention on unwanted positions, usually the paddings. It
+                is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be masked out. Default None
+        Returns:
+            Variable: The output of Transformer encoder. It is a tensor that has \
+                the same shape and data type as `enc_input`.
+        """
+        for encoder_layer in self.encoder_layers:
+            enc_output = encoder_layer(enc_input, attn_bias)
+            enc_input = enc_output
+        return self.processer(enc_output)
+class TransformerDecoderLayer(Layer):
+    """
+    TransformerDecoderLayer is composed of three sub-layers which are decoder
+    self (multi-head) attention, decoder-encoder cross attention and feedforward
+    network. Before and after each sub-layer, pre-process and post-precess would
+    be applied on the input and output.
+    Parameters:
+        n_head (int): The number of heads in multi-head attention(MHA).
+        d_key (int): The feature size to transformer queries and keys as in
+            multi-head attention. Mostly it equals to `d_model // n_head`.
+        d_value (int): The feature size to transformer values as in multi-head
+            attention. Mostly it equals to `d_model // n_head`.
+        d_model (int): The expected feature size in the input and output.
+        d_inner_hid (int): The hidden layer size in the feedforward network(FFN).
+        prepostprocess_dropout (float, optional): The dropout probability used
+            in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1
+        attention_dropout (float, optional): The dropout probability used
+            in MHA to drop some attention target. Default 0.1
+        relu_dropout (float, optional): The dropout probability used after FFN
+            activition. Default 0.1
+        preprocess_cmd (str, optional): The process applied before each MHA and
+            FFN sub-layer, and it also would be applied on output of the last
+            stacked layer. It should be a string composed of `d`, `a`, `n`,
+            where `d` for dropout, `a` for add residual connection, `n` for
+            layer normalization. Default `n`.
+        postprocess_cmd (str, optional): The process applied after each MHA and
+            FFN sub-layer. Same as `preprocess_cmd`. It should be a string
+            composed of `d`, `a`, `n`, where `d` for dropout, `a` for add
+            residual connection, `n` for layer normalization. Default `da`.
+        ffn_fc1_act (str, optional): The activation function in the feedforward
+            network. Default relu.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import TransformerDecoderLayer
+            # decoder input: [batch_size, trg_len, d_model]
+            dec_input = paddle.rand((2, 4, 128))
+            # encoder output: [batch_size, src_len, d_model]
+            enc_output = paddle.rand((2, 6, 128))
+            # self attention bias: [batch_size, n_head, trg_len, trg_len]
+            self_attn_bias = paddle.rand((2, 2, 4, 4))
+            # cross attention bias: [batch_size, n_head, trg_len, src_len]
+            cross_attn_bias = paddle.rand((2, 2, 4, 6))
+            decoder_layer = TransformerDecoderLayer(2, 64, 64, 128, 512)
+            output = decoder_layer(dec_input,
+                                   enc_output,
+                                   self_attn_bias,
+                                   cross_attn_bias)  # [2, 4, 128]
+    """
+    def __init__(self,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout=0.1,
+                 attention_dropout=0.1,
+                 relu_dropout=0.1,
+                 preprocess_cmd="n",
+                 postprocess_cmd="da",
+                 ffn_fc1_act="relu"):
+        super(TransformerDecoderLayer, self).__init__()
+        self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
+                                                 prepostprocess_dropout)
+        self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
+                                            attention_dropout)
+        self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
+                                                  prepostprocess_dropout)
+        self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
+                                                 prepostprocess_dropout)
+        self.cross_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
+                                             attention_dropout)
+        self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
+                                                  prepostprocess_dropout)
+        self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model,
+                                                 prepostprocess_dropout)
+        self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act)
+        self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model,
+                                                  prepostprocess_dropout)
+    def forward(self,
+                dec_input,
+                enc_output,
+                self_attn_bias=None,
+                cross_attn_bias=None,
+                cache=None):
+        """
+        Applies a Transformer decoder layer on the input.
+        Parameters:
+            dec_input (Variable): The input of Transformer decoder. It is a tensor
+                with shape `[batch_size, target_length, d_model]`. The data type
+                should be float32 or float64.
+            enc_output (Variable): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+            self_attn_bias (Variable, optional): A tensor used in decoder self attention
+                to mask out attention on unwanted positions, usually the subsequent positions.
+                It is a tensor with shape `[batch_size, n_head, target_length, target_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be masked out. Default None
+            cross_attn_bias (Variable, optional): A tensor used in decoder-encoder cross
+                attention to mask out attention on unwanted positions, usually the paddings.
+                It is a tensor with shape `[batch_size, n_head, target_length, target_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be masked out. Default None
+            caches(dict, optional): It caches the multi-head attention intermediate
+                results of history decoding steps and encoder output. It is a dict
+                has `k`, `v`, `static_k`, `statkc_v` as keys and values are cached
+                results. It is only used for inference and should be None for
+                training. Default None
+        Returns:
+            Variable: The output of Transformer decoder layer. It is a tensor \
+                that has the same shape and data type as `dec_input`.
+        """
+        self_attn_output = self.self_attn(
+            self.preprocesser1(dec_input), None, None, self_attn_bias, cache)
+        self_attn_output = self.postprocesser1(self_attn_output, dec_input)
+        cross_attn_output = self.cross_attn(
+            self.preprocesser2(self_attn_output), enc_output, enc_output,
+            cross_attn_bias, cache)
+        cross_attn_output = self.postprocesser2(cross_attn_output,
+                                                self_attn_output)
+        ffn_output = self.ffn(self.preprocesser3(cross_attn_output))
+        ffn_output = self.postprocesser3(ffn_output, cross_attn_output)
+        return ffn_output
+class TransformerDecoder(Layer):
+    """
+    TransformerDecoder is a stack of N decoder layers.
+    Parameters:
+        n_layer (int): The number of encoder layers to be stacked.
+        n_head (int): The number of heads in multi-head attention(MHA).
+        d_key (int): The feature size to transformer queries and keys as in
+            multi-head attention. Mostly it equals to `d_model // n_head`.
+        d_value (int): The feature size to transformer values as in multi-head
+            attention. Mostly it equals to `d_model // n_head`.
+        d_model (int): The expected feature size in the input and output.
+        d_inner_hid (int): The hidden layer size in the feedforward network(FFN).
+        prepostprocess_dropout (float, optional): The dropout probability used
+            in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1
+        attention_dropout (float, optional): The dropout probability used
+            in MHA to drop some attention target. Default 0.1
+        relu_dropout (float, optional): The dropout probability used after FFN
+            activition. Default 0.1
+        preprocess_cmd (str, optional): The process applied before each MHA and
+            FFN sub-layer, and it also would be applied on output of the last
+            stacked layer. It should be a string composed of `d`, `a`, `n`,
+            where `d` for dropout, `a` for add residual connection, `n` for
+            layer normalization. Default `n`.
+        postprocess_cmd (str, optional): The process applied after each MHA and
+            FFN sub-layer. Same as `preprocess_cmd`. It should be a string
+            composed of `d`, `a`, `n`, where `d` for dropout, `a` for add
+            residual connection, `n` for layer normalization. Default `da`.
+        ffn_fc1_act (str, optional): The activation function in the feedforward
+            network. Default relu.
+    Examples:
+        .. code-block:: python
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import TransformerDecoder
+            # decoder input: [batch_size, trg_len, d_model]
+            dec_input = paddle.rand((2, 4, 128))
+            # encoder output: [batch_size, src_len, d_model]
+            enc_output = paddle.rand((2, 6, 128))
+            # self attention bias: [batch_size, n_head, trg_len, trg_len]
+            self_attn_bias = paddle.rand((2, 2, 4, 4))
+            # cross attention bias: [batch_size, n_head, trg_len, src_len]
+            cross_attn_bias = paddle.rand((2, 2, 4, 6))
+            decoder = TransformerDecoder(2, 2, 64, 64, 128, 512)
+            dec_output = decoder(dec_input,
+                                 enc_output,
+                                 self_attn_bias,
+                                 cross_attn_bias)  # [2, 4, 128]
+    """
+    def __init__(self,
+                 n_layer,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout=0.1,
+                 attention_dropout=0.1,
+                 relu_dropout=0.1,
+                 preprocess_cmd="n",
+                 postprocess_cmd="da",
+                 ffn_fc1_act="relu"):
+        super(TransformerDecoder, self).__init__()
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.d_key = d_key
+        self.d_value = d_value
+        self.decoder_layers = list()
+        for i in range(n_layer):
+            self.decoder_layers.append(
+                self.add_sublayer(
+                    "layer_%d" % i,
+                    TransformerDecoderLayer(n_head, d_key, d_value, d_model,
+                                            d_inner_hid, prepostprocess_dropout,
+                                            attention_dropout, relu_dropout,
+                                            preprocess_cmd, postprocess_cmd)))
+        self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
+                                             prepostprocess_dropout)
+    def forward(self,
+                dec_input,
+                enc_output,
+                self_attn_bias=None,
+                cross_attn_bias=None,
+                caches=None):
+        """
+        Applies a stack of N Transformer decoder layers on inputs.
+        Parameters:
+            dec_input (Variable): The input of Transformer decoder. It is a tensor
+                with shape `[batch_size, target_length, d_model]`. The data type
+                should be float32 or float64.
+            enc_output (Variable): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+            self_attn_bias (Variable, optional): A tensor used in decoder self attention
+                to mask out attention on unwanted positions, usually the subsequent positions.
+                It is a tensor with shape `[batch_size, n_head, target_length, target_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be masked out. Default None
+            cross_attn_bias (Variable, optional): A tensor used in decoder-encoder cross
+                attention to mask out attention on unwanted positions, usually the paddings.
+                It is a tensor with shape `[batch_size, n_head, target_length, target_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. The data type should be float32 or float64. It can
+                be None when nothing wanted or needed to be masked out. Default None
+            caches(list, optional): It caches the multi-head attention intermediate results
+                of history decoding steps and encoder output. It is a list of dict
+                where the length of list is decoder layer number, and each dict
+                has `k`, `v`, `static_k`, `statkc_v` as keys and values are cached
+                results. It is only used for inference and should be None for
+                training. Default None
+        Returns:
+            Variable: The output of Transformer decoder. It is a tensor that has \
+                the same shape and data type as `dec_input`.
+        """
+        for i, decoder_layer in enumerate(self.decoder_layers):
+            dec_output = decoder_layer(dec_input, enc_output, self_attn_bias,
+                                       cross_attn_bias, caches[i]
+                                       if caches else None)
+            dec_input = dec_output
+        return self.processer(dec_output)
+    def prepare_static_cache(self, enc_output):
+        """
+        Generate a list of dict where the length of list is decoder layer number.
+        Each dict has `static_k`, `statkc_v` as keys, and values are projected
+        results of encoder output to be used as keys and values in decoder-encoder
+        cross (multi-head) attention. Used in inference.
+        Parameters:
+            enc_output (Variable): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64.
+        Returns:
+            list: A list of dict. Each dict has `static_k`, `statkc_v` as keys, \
+                and values are projected results of encoder output to be used as \
+                keys and values in decoder-encoder cross (multi-head) attention.
+        """
+        return [
+            dict(
+                zip(("static_k", "static_v"),
+                    decoder_layer.cross_attn.cal_kv(enc_output, enc_output)))
+            for decoder_layer in self.decoder_layers
+        ]
+    def prepare_incremental_cache(self, enc_output):
+        """
+        Generate a list of dict where the length of list is decoder layer number.
+        Each dict has `k`, `v` as keys, and values are empty tensors with shape
+        `[batch_size, n_head, 0, d_key]` and `[batch_size, n_head, 0, d_value]`,
+        representing the decoder self (multi-head) attention intermediate results,
+        and 0 is the initial length which would increase as inference decoding
+        continued. Used in inference.
+        Parameters:
+            enc_output (Variable): The output of Transformer encoder. It is a tensor
+                with shape `[batch_size, source_length, d_model]`. The data type
+                should be float32 or float64. Actually, it is used to provide batch
+                size for Transformer initial states(caches), thus any tensor has
+                wanted batch size can be used here.
+        Returns:
+            list: A list of dict. Each dict has `k`, `v` as keys, and values are \
+                empty tensors representing intermediate results of history decoding \
+                steps in decoder self (multi-head) attention at time step 0.
+        """
+        return [{
+            "k": layers.fill_constant_batch_size_like(
+                input=enc_output,
+                shape=[-1, self.n_head, 0, self.d_key],
+                dtype=enc_output.dtype,
+                value=0),
+            "v": layers.fill_constant_batch_size_like(
+                input=enc_output,
+                shape=[-1, self.n_head, 0, self.d_value],
+                dtype=enc_output.dtype,
+                value=0),
+        } for i in range(self.n_layer)]
+class LinearChainCRF(Layer):
+    """
+    Computes the negtive log-likelihood of tag sequences in a linear chain CRF. 
+    Using terminologies of undirected probabilistic graph model, it calculates
+    probability using unary potentials (for emission) and binary potentials 
+    (for transition). 
+    This layer creates a learnable parameter shaped `[size + 2, size]` (`size`
+    is for the number of tags), where:
+    1. the first row is for starting weights, denoted as $a$ here
+    2. the second row is for ending weights, denoted as $b$ here.
+    3. the remaining rows is a matrix for transition weights. 
+    Denote input tensor of unary potentials(emission) as $x$ , then the probability
+    of a tag sequence $s$ of length $L$ is defined as:
+    $$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L}
+                    + \sum_{l=1}^L x_{s_l}
+                    + \sum_{l=2}^L w_{s_{l-1},s_l})$$
+    where $Z$ is a normalization value so that the sum of $P(s)$ over
+    all possible sequences is 1, and $x$ is the emission feature weight
+    to the linear chain CRF.
+    This operator implements the Forward-Backward algorithm for the linear chain
+    CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
+    http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.
+    NOTE:
+    1. The feature function for a CRF is made up of the emission features and the
+    transition features. The emission feature weights are NOT computed in
+    this operator. They MUST be computed first before this operator is called.
+    2. Because this operator performs global normalization over all possible
+    sequences internally, it expects UNSCALED emission feature weights.
+    Please do not call this op with the emission feature being output of any
+    nonlinear activation.
+    3. The 2nd dimension of input(emission) MUST be equal to the tag number.
+    Parameters:
+        size (int): The number of tags.
+        param_attr (ParamAttr, optional): The attribute of the learnable parameter for
+            transition. Default: None
+        dtype (str, optional): Data type, it can be 'float32' or 'float64'.
+            Default: `float32`
+    Examples:
+        .. code-block:: python
+            import numpy as np
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import LinearChainCRF
+            # emission: [batch_size, sequence_length, num_tags]
+            emission = paddle.rand((2, 8, 5))
+            # label: [batch_size, sequence_length, num_tags]
+            # dummy label just for example usage
+            label = paddle.ones((2, 8), dtype='int64')  
+            length = fluid.layers.assign(np.array([6, 8]).astype('int64'))
+            crf = LinearChainCRF(size=5)
+            cost = crf(emission, label, length)  # [2, 1]
+    """
+    def __init__(self, size, param_attr=None, dtype='float32'):
+        super(LinearChainCRF, self).__init__()
+        self._param_attr = param_attr
+        self._dtype = dtype
+        self._size = size
+        self._transition = self.create_parameter(
+            attr=self._param_attr,
+            shape=[self._size + 2, self._size],
+            dtype=self._dtype)
+    @property
+    def weight(self):
+        """
+        getter for transition matrix parameter
+        Returns:
+            Parameter: The learnable transition parameter shaped `[size + 2, size]` \
+                (`size` is for the number of tags). The data type should be float32 \
+                or float64.
+        """
+        return self._transition
+    @weight.setter
+    def weight(self, value):
+        """
+        setter for transition matrix parameter
+        Parameters:
+            value (Parameter): The learnable transition parameter shaped `[size + 2, size]` \
+                (`size` is for the number of tags). The data type should be float32 \
+                or float64.
+        """
+        self._transition = value
+    def forward(self, input, label, length):
+        """
+        Computes the log-likelihood of tag sequences in a linear chain CRF.
+        Parameters:
+            input (Variable): The input of unary potentials(emission). It is a
+                tensor with shape `[batch_size, sequence_length, num_tags]`.
+                The data type should be float32 or float64.
+            label (Variable): The golden sequence tags. It is a tensor
+                with shape `[batch_size, sequence_length]`. The data type
+                should be int64.
+            length (Variable): A tensor with shape `[batch_size]`. It stores real
+                length of each sequence for correctness.
+        Returns:
+            Variable: The negtive log-likelihood of tag sequences. It is a tensor \
+                with shape `[batch_size, 1]` and has float32 or float64 data type.
+        """
+        alpha = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        emission_exps = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        transition_exps = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        log_likelihood = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        this_inputs = {
+            "Emission": [input],
+            "Transition": self._transition,
+            "Label": [label]
+        }
+        if length is not None:
+            this_inputs['Length'] = [length]
+        self._helper.append_op(
+            type='linear_chain_crf',
+            inputs=this_inputs,
+            outputs={
+                "Alpha": [alpha],
+                "EmissionExps": [emission_exps],
+                "TransitionExps": transition_exps,
+                "LogLikelihood": log_likelihood
+            })
+        return log_likelihood
+class CRFDecoding(Layer):
+    """
+    CRFDecoding reads the emission feature weights and the transition
+    feature weights learned by the `LinearChainCRF` and performs decoding. 
+    It implements the Viterbi algorithm which is a dynamic programming algorithm 
+    for finding the most likely sequence of hidden states, called the Viterbi path, 
+    that results in a sequence of observed tags.
+    The output of this layer changes according to whether `label` is given:
+    1. `label` is given:
+    This happens in training. This operator is used to co-work with the chunk_eval
+    operator. When `label` is given, it returns tensor with the same shape as 
+    `label` whose values are fixed to be 0, indicating an incorrect prediction,
+    or 1 indicating a tag is correctly predicted. Such an output is the input to
+    chunk_eval operator.
+    2. `label` is not given:
+    This is the standard decoding process and get the highest scoring sequence
+    of tags.
+    Parameters:
+        size (int): The number of tags.
+        param_attr (ParamAttr, optional): The attribute of the learnable parameter for
+            transition. Default: None
+        dtype (str, optional): Data type, it can be 'float32' or 'float64'.
+            Default: `float32`
+    Examples:
+        .. code-block:: python
+            import numpy as np
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import CRFDecoding
+            # emission: [batch_size, sequence_length, num_tags]
+            emission = paddle.rand((2, 8, 5))
+            length = fluid.layers.assign(np.array([6, 8]).astype('int64'))
+            crf_decoding = CRFDecoding(size=5)
+            cost = crf_decoding(emission, length)  # [2, 8]
+    """
+    def __init__(self, size, param_attr=None, dtype='float32'):
+        super(CRFDecoding, self).__init__()
+        self._dtype = dtype
+        self._size = size
+        self._param_attr = param_attr
+        self._transition = self.create_parameter(
+            attr=self._param_attr,
+            shape=[self._size + 2, self._size],
+            dtype=self._dtype)
+    @property
+    def weight(self):
+        """
+        getter for transition matrix parameter
+        Returns:
+            Parameter: The learnable transition parameter shaped `[size + 2, size]` \
+                (`size` is for the number of tags). The data type should be float32 \
+                or float64.
+        """
+        return self._transition
+    @weight.setter
+    def weight(self, value):
+        """
+        setter for transition matrix parameter
+        Parameters:
+            value (Parameter): The learnable transition parameter shaped `[size + 2, size]` \
+                (`size` is for the number of tags). The data type should be float32 \
+                or float64.
+        """
+        self._transition = value
+    def forward(self, input, length, label=None):
+        """
+        Performs sequence tagging prediction.
+        Parameters:
+            input (Variable): The input of unary potentials(emission). It is a
+                tensor with shape `[batch_size, sequence_length, num_tags]`.
+                The data type should be float32 or float64.
+            length (Variable): A tensor with shape `[batch_size]`.
+                It stores real length of each sequence for correctness.
+            label (Variable, optional): The golden sequence tags. It is a tensor
+                with shape `[batch_size, sequence_length]`. The data type
+                should be int64. Default None.
+        Returns:
+            Variable: A tensor with shape `[batch_size, sequence_length]` and \
+                int64 data type. If `label` is None, the tensor has binary values \
+                indicating a correct or incorrect prediction. Otherwise its values \
+                range from 0 to maximum tag number - 1, each element indicates \
+                an index of a predicted tag.
+        """
+        viterbi_path = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        this_inputs = {
+            "Emission": [input],
+            "Transition": self._transition,
+            "Label": label
+        }
+        if length is not None:
+            this_inputs['Length'] = [length]
+        self._helper.append_op(
+            type='crf_decoding',
+            inputs=this_inputs,
+            outputs={"ViterbiPath": [viterbi_path]})
+        return viterbi_path
+class _GRUEncoder(Layer):
+    """
+    A multi-layer bidirectional GRU encoder used by SequenceTagging.
+    """
+    def __init__(self,
+                 input_dim,
+                 grnn_hidden_dim,
+                 init_bound,
+                 num_layers=1,
+                 is_bidirection=False):
+        super(_GRUEncoder, self).__init__()
+        self.num_layers = num_layers
+        self.is_bidirection = is_bidirection
+        self.gru_list = []
+        self.gru_r_list = []
+        for i in range(num_layers):
+            self.basic_gru_cell = BasicGRUCell(
+                input_size=input_dim if i == 0 else input_dim * 2,
+                hidden_size=grnn_hidden_dim,
+                param_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.UniformInitializer(
+                        low=-init_bound, high=init_bound),
+                    regularizer=fluid.regularizer.L2DecayRegularizer(
+                        regularization_coeff=1e-4)))
+            self.gru_list.append(
+                self.add_sublayer(
+                    "gru_%d" % i,
+                    RNN(self.basic_gru_cell, is_reverse=False,
+                        time_major=False)))
+        if self.is_bidirection:
+            for i in range(num_layers):
+                self.basic_gru_cell_r = BasicGRUCell(
+                    input_size=input_dim if i == 0 else input_dim * 2,
+                    hidden_size=grnn_hidden_dim,
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.UniformInitializer(
+                            low=-init_bound, high=init_bound),
+                        regularizer=fluid.regularizer.L2DecayRegularizer(
+                            regularization_coeff=1e-4)))
+                self.gru_r_list.append(
+                    self.add_sublayer(
+                        "gru_r_%d" % i,
+                        RNN(self.basic_gru_cell_r,
+                            is_reverse=True,
+                            time_major=False)))
+    def forward(self, input_feature, h0=None):
+        for i in range(self.num_layers):
+            pre_gru, pre_state = self.gru_list[i](input_feature)
+            if self.is_bidirection:
+                gru_r, r_state = self.gru_r_list[i](input_feature)
+                out = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1)
+            else:
+                out = pre_gru
+            input_feature = out
+        return out
+class SequenceTagging(Layer):
+    """
+    Sequence tagging model using multi-layer bidirectional GRU as backbone and
+    linear chain CRF as output layer.
+    Parameters:
+        vocab_size (int): The size of vocabulary.
+        num_labels (int): The number of labels.
+        word_emb_dim (int, optional): The embedding size. Defalut 128
+        grnn_hidden_dim (int, optional): The hidden size of GRU. Defalut 128
+        emb_learning_rate (int, optional): The partial learning rate for embedding.
+            The actual learning rate for embedding would multiply it with the global
+            learning rate. Default 0.1
+        crf_learning_rate (int, optional): The partial learning rate for crf. The
+            actual learning rate for embedding would multiply it with the global
+            learning rate. Default 0.1
+        bigru_num (int, optional): The number of bidirectional GRU layers.
+            Default 2
+        init_bound (float, optional): The range for uniform initializer would
+            be `(-init_bound, init_bound)`. It would be used for all parameters
+            except CRF transition matrix. Default 0.1
+    Examples:
+        .. code-block:: python
+            import numpy as np
+            import paddle
+            import paddle.fluid as fluid
+            from paddle.incubate.hapi.text import SequenceTagging
+            # word: [batch_size, sequence_length]
+            # dummy input just for example
+            word = paddle.ones((2, 8), dtype='int64')
+            length = fluid.layers.assign(np.array([6, 8]).astype('int64'))
+            seq_tagger = SequenceTagging(vocab_size=100, num_labels=5)
+            outputs = seq_tagger(word, length)
+    """
+    def __init__(self,
+                 vocab_size,
+                 num_labels,
+                 word_emb_dim=128,
+                 grnn_hidden_dim=128,
+                 emb_learning_rate=0.1,
+                 crf_learning_rate=0.1,
+                 bigru_num=2,
+                 init_bound=0.1):
+        super(SequenceTagging, self).__init__()
+        self.word_emb_dim = word_emb_dim
+        self.vocab_size = vocab_size
+        self.num_labels = num_labels
+        self.grnn_hidden_dim = grnn_hidden_dim
+        self.emb_lr = emb_learning_rate
+        self.crf_lr = crf_learning_rate
+        self.bigru_num = bigru_num
+        self.init_bound = 0.1
+        self.word_embedding = Embedding(
+            size=[self.vocab_size, self.word_emb_dim],
+            dtype='float32',
+            param_attr=fluid.ParamAttr(
+                learning_rate=self.emb_lr,
+                name="word_emb",
+                initializer=fluid.initializer.Uniform(
+                    low=-self.init_bound, high=self.init_bound)))
+        self.gru_encoder = _GRUEncoder(
+            input_dim=self.grnn_hidden_dim,
+            grnn_hidden_dim=self.grnn_hidden_dim,
+            init_bound=self.init_bound,
+            num_layers=self.bigru_num,
+            is_bidirection=True)
+        self.fc = Linear(
+            input_dim=self.grnn_hidden_dim * 2,
+            output_dim=self.num_labels,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-self.init_bound, high=self.init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+        self.linear_chain_crf = LinearChainCRF(
+            param_attr=fluid.ParamAttr(
+                name='linear_chain_crfw', learning_rate=self.crf_lr),
+            size=self.num_labels)
+        self.crf_decoding = CRFDecoding(
+            param_attr=fluid.ParamAttr(
+                name='crfw', learning_rate=self.crf_lr),
+            size=self.num_labels)
+    def forward(self, word, lengths, target=None):
+        """
+        Performs sequence tagging. If `target` is None, it is for training and
+        loss would be returned, otherwise it is for inference and returns the
+        predicted tags.
+        Parameters:
+            word (Variable): The input sequences to be labeled. It is a tensor
+                with shape `[batch_size, sequence_length]`. The data type should
+                be int64.
+            lengths (Variable): A tensor with shape `[batch_size]`. It stores real
+                length of each sequence.
+            target (Variable, optional): The golden sequence tags. It is a tensor
+                with shape `[batch_size, sequence_length]`. The data type
+                should be int64. It could be None for inference. Default None.
+        Returns:
+            tuple: A tuple( :code:`(crf_decode, avg_cost, lengths)` ) If input \
+                argument `target` is provided, including the most likely sequence \
+                tags, the averaged CRF cost and the sequence lengths, the shapes \
+                are `[batch_size, sequence_length]`, `[1]` and `[batch_size]`, \
+                and the data types are int64, float32 and int64. Otherwise A \
+                tuple( :code:`(crf_decode, lengths)` ) for inference.
+        """
+        word_embed = self.word_embedding(word)
+        input_feature = word_embed
+        bigru_output = self.gru_encoder(input_feature)
+        emission = self.fc(bigru_output)
+        if target is not None:
+            crf_cost = self.linear_chain_crf(
+                input=emission, label=target, length=lengths)
+            avg_cost = fluid.layers.mean(x=crf_cost)
+            self.crf_decoding.weight = self.linear_chain_crf.weight
+            crf_decode = self.crf_decoding(input=emission, length=lengths)
+            return crf_decode, avg_cost, lengths
+        else:
+            self.linear_chain_crf.weight = self.crf_decoding.weight
+            crf_decode = self.crf_decoding(input=emission, length=lengths)
+            return crf_decode, lengths
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -183,6 +183,7 @@ packages=['paddle',
          'paddle.incubate.hapi.vision',
          'paddle.incubate.hapi.vision.models',
          'paddle.incubate.hapi.vision.transforms',
+          'paddle.incubate.hapi.text',
          'paddle.io',
          'paddle.nn',
          'paddle.nn.functional',

--- a/tools/wlist.json
+++ b/tools/wlist.json
@@ -334,6 +334,34 @@
        "ParallelEnv",
        "DataParallel",
        "DataParallel.scale_loss",
-        "DataParallel.apply_collective_grads"
+        "DataParallel.apply_collective_grads",
+        "BasicLSTMCell.forward",
+        "BasicGRUCell.forward",
+        "RNN.forward",
+        "StackedRNNCell.forward",
+        "StackedLSTMCell.forward",
+        "LSTM.forward",
+        "BidirectionalRNN.forward",
+        "BidirectionalLSTM.forward",
+        "StackedGRUCell.forward",
+        "GRU.forward",
+        "BidirectionalGRU.forward",
+        "DynamicDecode.forward",
+        "Conv1dPoolLayer.forward",
+        "CNNEncoder.forward",
+        "TransformerCell.forward",
+        "TransformerBeamSearchDecoder.step",
+        "MultiHeadAttention.forward",
+        "MultiHeadAttention.cal_kv",
+        "FFN.forward",
+        "TransformerEncoderLayer.forward",
+        "TransformerEncoder.forward",
+        "TransformerDecoderLayer.forward",
+        "TransformerDecoder.forward",
+        "TransformerDecoder.prepare_static_cache",
+        "TransformerDecoder.prepare_incremental_cache",
+        "LinearChainCRF.forward",
+        "CRFDecoding.forward",
+        "SequenceTagging.forward"
    ]
 }