diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index a812abcbdca069dbeab1aa9515611476e0dd4bfe..813e1e5299053574a86c6f08f5ce8e066cfe566c 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -605,8 +605,8 @@ class U2BaseModel(ASRInterface, nn.Layer):
             xs: paddle.Tensor,
             offset: int,
             required_cache_size: int,
-            att_cache: paddle.Tensor,
-            cnn_cache: paddle.Tensor,
+            att_cache: paddle.Tensor,  # paddle.zeros([0, 0, 0, 0])
+            cnn_cache: paddle.Tensor,  # paddle.zeros([0, 0, 0, 0])
     ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         """ Export interface for c++ call, give input chunk xs,
             and return output from time 0 to current chunk.
diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py
index cbcaccc26e2fe3c4cab59852b7098070ff24c787..92990048d163a84a37532cd1f96f5ef0448a4697 100644
--- a/paddlespeech/s2t/modules/attention.py
+++ b/paddlespeech/s2t/modules/attention.py
@@ -86,7 +86,8 @@ class MultiHeadedAttention(nn.Layer):
             self,
             value: paddle.Tensor,
             scores: paddle.Tensor,
-            mask: paddle.Tensor, ) -> paddle.Tensor:
+            mask: paddle.Tensor,  # paddle.ones([0, 0, 0], dtype=paddle.bool)
+    ) -> paddle.Tensor:
         """Compute attention context vector.
         Args:
             value (paddle.Tensor): Transformed value, size
@@ -126,13 +127,15 @@ class MultiHeadedAttention(nn.Layer):
 
         return self.linear_out(x)  # (batch, time1, d_model)
 
-    def forward(self,
-                query: paddle.Tensor,
-                key: paddle.Tensor,
-                value: paddle.Tensor,
-                mask: paddle.Tensor,
-                pos_emb: paddle.Tensor,
-                cache: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]:
+    def forward(
+            self,
+            query: paddle.Tensor,
+            key: paddle.Tensor,
+            value: paddle.Tensor,
+            mask: paddle.Tensor,  # paddle.ones([0, 0, 0], dtype=paddle.bool)
+            pos_emb: paddle.Tensor,  # paddle.empty([0])
+            cache: paddle.Tensor  # paddle.zeros([0, 0, 0, 0])
+    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
         """Compute scaled dot product attention.
         Args:
             query (paddle.Tensor): Query tensor (#batch, time1, size).
@@ -241,13 +244,15 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
 
         return x
 
-    def forward(self,
-                query: paddle.Tensor,
-                key: paddle.Tensor,
-                value: paddle.Tensor,
-                mask: paddle.Tensor,
-                pos_emb: paddle.Tensor,
-                cache: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]:
+    def forward(
+            self,
+            query: paddle.Tensor,
+            key: paddle.Tensor,
+            value: paddle.Tensor,
+            mask: paddle.Tensor,  # paddle.ones([0, 0, 0], dtype=paddle.bool)
+            pos_emb: paddle.Tensor,  # paddle.empty([0])
+            cache: paddle.Tensor  # paddle.zeros([0, 0, 0, 0])
+    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
         """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
         Args:
             query (paddle.Tensor): Query tensor (#batch, time1, size).
diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py
index 23aecd7f1360855eeb33ef2b7a0effd8d79363af..b35fea5b9019973f48173d883ead54fe83c8bb31 100644
--- a/paddlespeech/s2t/modules/conformer_convolution.py
+++ b/paddlespeech/s2t/modules/conformer_convolution.py
@@ -105,10 +105,12 @@ class ConvolutionModule(nn.Layer):
         )
         self.activation = activation
 
-    def forward(self,
-                x: paddle.Tensor,
-                mask_pad: paddle.Tensor,
-                cache: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]:
+    def forward(
+            self,
+            x: paddle.Tensor,
+            mask_pad: paddle.Tensor,  # paddle.ones([0, 0, 0], dtype=paddle.bool)
+            cache: paddle.Tensor  # paddle.zeros([0, 0, 0, 0])
+    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
         """Compute convolution module.
         Args:
             x (paddle.Tensor): Input tensor (#batch, time, channels).
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index 6001afd4b92289163116f8ff44b187638d4cd112..abdaf5ea7d6507336e8459b288cb3b9ffb6a88fb 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -190,9 +190,9 @@ class BaseEncoder(nn.Layer):
             xs: paddle.Tensor,
             offset: int,
             required_cache_size: int,
-            att_cache: paddle.Tensor,
-            cnn_cache: paddle.Tensor,
-            att_mask: paddle.Tensor,
+            att_cache: paddle.Tensor,  # paddle.zeros([0, 0, 0, 0])
+            cnn_cache: paddle.Tensor,  # paddle.zeros([0, 0, 0, 0])
+            att_mask: paddle.Tensor,  # paddle.ones([0, 0, 0], dtype=paddle.bool)
     ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         """ Forward just one chunk
         Args:
diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py
index 8fd991ec65a6858de781179b7b89b274ac0c14ba..3972ff90afea244b3f82fa86c4166ad5c4639565 100644
--- a/paddlespeech/s2t/modules/encoder_layer.py
+++ b/paddlespeech/s2t/modules/encoder_layer.py
@@ -76,9 +76,9 @@ class TransformerEncoderLayer(nn.Layer):
             x: paddle.Tensor,
             mask: paddle.Tensor,
             pos_emb: paddle.Tensor,
-            mask_pad: paddle.Tensor,
-            att_cache: paddle.Tensor,
-            cnn_cache: paddle.Tensor,
+            mask_pad: paddle.Tensor,  # paddle.ones([0, 0, 0], dtype=paddle.bool)
+            att_cache: paddle.Tensor,  # paddle.zeros([0, 0, 0, 0])
+            cnn_cache: paddle.Tensor,  # paddle.zeros([0, 0, 0, 0])
     ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         """Compute encoded features.
         Args:
@@ -194,9 +194,9 @@ class ConformerEncoderLayer(nn.Layer):
             x: paddle.Tensor,
             mask: paddle.Tensor,
             pos_emb: paddle.Tensor,
-            mask_pad: paddle.Tensor,
-            att_cache: paddle.Tensor,
-            cnn_cache: paddle.Tensor,
+            mask_pad: paddle.Tensor,  # paddle.ones([0, 0, 0], dtype=paddle.bool)
+            att_cache: paddle.Tensor,  # paddle.zeros([0, 0, 0, 0])
+            cnn_cache: paddle.Tensor,  # paddle.zeros([0, 0, 0, 0])
     ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         """Compute encoded features.
         Args:
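
Context for the annotations above: the exported (jit / C++) interfaces cannot accept Optional[Tensor], so "no cache yet" is signalled by a zero-size tensor such as paddle.zeros([0, 0, 0, 0]), and a zero-size bool mask means "no masking". The sketch below shows one way a Python caller might drive the annotated forward_chunk interface chunk by chunk. It is illustrative only: the decode_streaming helper, the chunk_size arithmetic (in input frames), and the assumed return order (output, updated att_cache, updated cnn_cache) are assumptions for this example, not paddlespeech API.

    import paddle

    def decode_streaming(model, feats, chunk_size=16, required_cache_size=-1):
        """Hypothetical driver: feed feats of shape [1, T, D] through
        model.forward_chunk one chunk at a time, threading the caches."""
        # First chunk: no history yet, so pass the zero-size sentinels
        # documented in the comments added by this patch.
        att_cache = paddle.zeros([0, 0, 0, 0])
        cnn_cache = paddle.zeros([0, 0, 0, 0])
        outputs = []
        offset = 0
        for start in range(0, feats.shape[1], chunk_size):
            chunk = feats[:, start:start + chunk_size, :]
            # Assumed return order: (output, att_cache, cnn_cache); the
            # updated caches are fed back into the next call.
            ys, att_cache, cnn_cache = model.forward_chunk(
                chunk, offset, required_cache_size, att_cache, cnn_cache)
            outputs.append(ys)
            offset += ys.shape[1]  # offset counts emitted (subsampled) frames
        return paddle.concat(outputs, axis=1)

Resetting both caches to paddle.zeros([0, 0, 0, 0]) between utterances restores the "time 0" state that the forward_chunk docstring refers to.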