transformer_block.py
#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
TransformerBlock class.
"""

import paddle.fluid as fluid
from paddle.fluid.dygraph import Layer
import paddle.fluid.layers as layers

from plato.modules.feedforward import FeedForward
from plato.modules.layer_norm import LayerNorm
from plato.modules.multihead_attention import MultiheadAttention
import plato.modules.functions as F


class TransformerBlock(Layer):
    """
    Transformer block module.
    """

    def __init__(self, name_scope, hidden_dim, num_heads, dropout, attn_dropout, ff_dropout):
        super().__init__(name_scope)

        self.attn = MultiheadAttention(name_scope=self.full_name(),
                                       hidden_dim=hidden_dim,
                                       num_heads=num_heads,
                                       dropout=attn_dropout)
        self.attn_norm = LayerNorm(name_scope=self.full_name(),
                                   begin_norm_axis=2,
                                   epsilon=1e-12,
                                   param_attr=fluid.ParamAttr(
                                       regularizer=fluid.regularizer.L2Decay(0.0)),
                                   bias_attr=fluid.ParamAttr(
                                       regularizer=fluid.regularizer.L2Decay(0.0)))
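        # Position-wise feed-forward sub-layer (inner dim = 4 * hidden_dim) and its post-layer norm.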
        self.ff = FeedForward(name_scope=self.full_name(),
                              hidden_dim=hidden_dim,
                              inner_dim=4 * hidden_dim,
                              dropout=ff_dropout)
        self.ff_norm = LayerNorm(name_scope=self.full_name(),
                                 begin_norm_axis=2,
                                 epsilon=1e-12,
                                 param_attr=fluid.ParamAttr(
                                     regularizer=fluid.regularizer.L2Decay(0.0)),
                                 bias_attr=fluid.ParamAttr(
                                     regularizer=fluid.regularizer.L2Decay(0.0)))
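        # Dropout rate applied to each sub-layer's output before the residual connection.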
        self.dropout = dropout
        return

    def forward(self, inp, mask=None, cache=None):
        """
        Forward process on one transformer layer.

        @param : x
        @type : Variable(shape: [batch_size, seq_len, hidden_size])

        @param : memory
        @type : Variable(shape: [batch_size, seq_len, hidden_size])

        @param : mask

        @param : cache
        """
        attn_out = self.attn(inp, mask, cache)
        attn_out = F.dropout(attn_out, self.dropout)
        attn_out = self.attn_norm(attn_out + inp)

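        # Feed-forward sub-layer: LayerNorm(attn_out + Dropout(FeedForward(attn_out))).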
        ff_out = self.ff(attn_out)
        ff_out = F.dropout(ff_out, self.dropout)
        ff_out = self.ff_norm(ff_out + attn_out)

        return ff_out


def main():
    import numpy as np

    place = fluid.CPUPlace()
    with fluid.dygraph.guard(place):
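        # Smoke test: a small block (hidden_dim=10, num_heads=2, all dropout rates 0.5)
        # applied to a random input of shape [batch_size=2, seq_len=3, hidden_dim=10].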
        model = TransformerBlock("TransformerBlock", 10, 2, 0.5, 0.5, 0.5)
        inp = np.random.rand(2, 3, 10).astype("float32")
        inp = fluid.dygraph.to_variable(inp)
        # Forward pass without an attention mask (the second argument of forward() is the mask).
        out = model(inp)
        print(out)


if __name__ == "__main__":
    main()