# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""Positonal Encoding Module."""
import math
from typing import Tuple

import paddle
from paddle import nn

from paddlespeech.s2t.utils.log import Log

logger = Log(__name__).getlog()

__all__ = [
    "PositionalEncodingInterface", "NoPositionalEncoding", "PositionalEncoding",
    "RelPositionalEncoding"
]


class PositionalEncodingInterface:
    def forward(self, x: paddle.Tensor,
                offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Compute positional encoding.
        Args:
            x (paddle.Tensor): Input tensor (batch, time, `*`).
        Returns:
            paddle.Tensor: Encoded tensor (batch, time, `*`).
            paddle.Tensor: Positional embedding tensor (1, time, `*`).
        """
        raise NotImplementedError("forward method is not implemented")

    def position_encoding(self, offset: int, size: int) -> paddle.Tensor:
        """ For getting encoding in a streaming fashion
        Args:
            offset (int): start offset
            size (int): required size of position encoding
        Returns:
            paddle.Tensor: Corresponding position encoding
        """
        raise NotImplementedError("position_encoding method is not implemented")


class NoPositionalEncoding(nn.Layer, PositionalEncodingInterface):
    def __init__(self,
                 d_model: int,
                 dropout_rate: float,
                 max_len: int=5000,
                 reverse: bool=False):
        nn.Layer.__init__(self)

    def forward(self, x: paddle.Tensor,
                offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]:
        return x, None

    def position_encoding(self, offset: int, size: int) -> paddle.Tensor:
        return None


class PositionalEncoding(nn.Layer, PositionalEncodingInterface):
    def __init__(self,
                 d_model: int,
                 dropout_rate: float,
                 max_len: int=5000,
                 reverse: bool=False):
        """Positional encoding.
            PE(pos, 2i)   = sin(pos/(10000^(2i/dmodel)))
            PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
        Args:
            d_model (int): embedding dim.
            dropout_rate (float): dropout rate.
            max_len (int, optional): maximum input length. Defaults to 5000.
            reverse (bool, optional): Not used. Defaults to False.
        """
        nn.Layer.__init__(self)
        self.d_model = d_model
        self.max_len = max_len
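        # Inputs are scaled by sqrt(d_model) before the positional encoding
        # is added, following the Transformer convention.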
        self.xscale = paddle.to_tensor(math.sqrt(self.d_model))
        self.dropout = nn.Dropout(p=dropout_rate)
        self.pe = paddle.zeros([self.max_len, self.d_model])  #[T,D]

        position = paddle.arange(
            0, self.max_len, dtype=paddle.float32).unsqueeze(1)  #[T, 1]
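        # div_term[i] = 1 / 10000^(2i / d_model); multiplying by `position`
        # below gives the angles for the sin/cos pairs.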
        div_term = paddle.exp(
            paddle.arange(0, self.d_model, 2, dtype=paddle.float32) *
            -(math.log(10000.0) / self.d_model))

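        # Even feature indices hold sin, odd feature indices hold cos.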
        self.pe[:, 0::2] = paddle.sin(position * div_term)
        self.pe[:, 1::2] = paddle.cos(position * div_term)
        self.pe = self.pe.unsqueeze(0)  #[1, T, D]

    def forward(self, x: paddle.Tensor,
                offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Add positional encoding.
        Args:
            x (paddle.Tensor): Input. Its shape is (batch, time, ...)
            offset (int): position offset
        Returns:
            paddle.Tensor: Encoded tensor. Its shape is (batch, time, ...)
            paddle.Tensor: Positional embedding, for compatibility with RelPositionalEncoding, shape (1, time, ...)
        """
        T = x.shape[1]
        assert offset + x.shape[1] < self.max_len
        #TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor
        pos_emb = self.pe[:, offset:offset + T]
        x = x * self.xscale + pos_emb
        return self.dropout(x), self.dropout(pos_emb)

    def position_encoding(self, offset: int, size: int) -> paddle.Tensor:
        """ For getting encoding in a streaming fashion
        Attention!!!!!
        In the non-streaming case, dropout is applied only once over the whole
        utterance; in a streaming scenario this function is called several
        times with increasing input size, so dropout will be applied several
        times as well.
        Args:
            offset (int): start offset
            size (int): required size of position encoding
        Returns:
            paddle.Tensor: Corresponding position encoding
        """
        assert offset + size < self.max_len
        return self.dropout(self.pe[:, offset:offset + size])


class RelPositionalEncoding(PositionalEncoding):
    """Relative positional encoding module.
    See : Appendix B in https://arxiv.org/abs/1901.02860
    """

    def __init__(self, d_model: int, dropout_rate: float, max_len: int=5000):
        """
        Args:
            d_model (int): Embedding dimension.
            dropout_rate (float): Dropout rate.
            max_len (int, optional): Maximum input length. Defaults to 5000.
        """
        super().__init__(d_model, dropout_rate, max_len, reverse=True)

    def forward(self, x: paddle.Tensor,
                offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Compute positional encoding.
        Args:
            x (paddle.Tensor): Input tensor (batch, time, `*`).
        Returns:
            paddle.Tensor: Encoded tensor (batch, time, `*`).
            paddle.Tensor: Positional embedding tensor (1, time, `*`).
        """
        assert offset + x.shape[1] < self.max_len
        x = x * self.xscale
        #TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor
        pos_emb = self.pe[:, offset:offset + x.shape[1]]
        return self.dropout(x), self.dropout(pos_emb)
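

if __name__ == '__main__':
    # Minimal usage sketch; the shapes and hyper-parameters below are
    # illustrative assumptions, not values taken from this module.
    pos_enc = PositionalEncoding(d_model=256, dropout_rate=0.1)
    x = paddle.randn([2, 100, 256])  # (batch, time, feature)
    y, pos_emb = pos_enc(x)  # y: [2, 100, 256], pos_emb: [1, 100, 256]

    # RelPositionalEncoding scales x but returns the embedding separately,
    # to be consumed by a relative-position attention layer.
    rel_enc = RelPositionalEncoding(d_model=256, dropout_rate=0.1)
    y_rel, rel_pos_emb = rel_enc(x)

    # Streaming: fetch the encoding for a chunk starting at `offset`.
    chunk_pe = pos_enc.position_encoding(offset=16, size=16)  # [1, 16, 256]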