Commit 0a5cc555 authored by Hui Zhang

RoPE for streaming decoding

Parent b56fb85c
@@ -4,7 +4,10 @@
paddle version: 2.5.0
paddlespeech version: 1.5.0
Need to set `decoding.decoding_chunk_size=16` when decoding.
Tesla V100-SXM2-32GB: 1 node, 4 cards
Global BatchSize: 32 * 4
Training Done: 1 day, 12:56:39.639646
### `decoding.decoding_chunk_size=16`
> chunk_size=16, ((16 - 1) * 4 + 7) * 10ms = (16 * 4 + 3) * 10ms = 670ms
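The arithmetic above can be checked with a small helper. This is a minimal sketch assuming a 10 ms frame shift, a subsampling rate of 4, and a right context of 7 frames; the helper name `chunk_receptive_ms` is ours, not part of the recipe:

```python
def chunk_receptive_ms(chunk_size, subsampling_rate=4, right_context=7, frame_shift_ms=10):
    """Milliseconds of input speech consumed before one decoding chunk can be emitted."""
    frames = (chunk_size - 1) * subsampling_rate + right_context
    return frames * frame_shift_ms

assert chunk_receptive_ms(16) == 670  # matches the 670 ms quoted above
```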
@@ -15,15 +18,14 @@ Need to set `decoding.decoding_chunk_size=16` when decoding.
| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | 16, -1 | - | |
| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | |
## Conformer
paddle version: 2.2.2
paddlespeech version: 1.0.1
| Model | Params | Config | Augmentation | Test set | Decode method | Loss | CER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_prefix_beam_search | - | 0.0480 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 |
### `decoding.decoding_chunk_size=-1`
| Model | Params | Config | Augmentation | Test set | Decode method | Chunk Size & Left Chunks | Loss | CER |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention | -1, -1 | - | 5.39 |
| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_greedy_search | -1, -1 | - | 5.51 |
| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | -1, -1 | - | 5.51 |
| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | -1, -1 | - | 4.99 |
## Conformer Streaming
@@ -39,6 +41,17 @@ Need to set `decoding.decoding_chunk_size=16` when decoding.
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | 0.051968 |
## Conformer
paddle version: 2.2.2
paddlespeech version: 1.0.1
| Model | Params | Config | Augmentation | Test set | Decode method | Loss | CER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_prefix_beam_search | - | 0.0480 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 |
## Transformer
| Model | Params | Config | Augmentation | Test set | Decode method | Loss | CER |
......
@@ -15,8 +15,8 @@
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""Multi-Head Attention layer definition."""
import math
from typing import Tuple
from typing import List
from typing import Tuple
import paddle
from paddle import nn
@@ -428,7 +428,7 @@ class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention):
# (B,H,T,D)
ndim = tensors[0].dim()
_,H,T,D = tensors[0].shape
_, H, T, D = tensors[0].shape
# sinusoidal shape same with tensors[0]
# [B,T,D] -> [B,T,H,D/H] -> (B,H,T,D/H)
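For readers unfamiliar with RoPE, the sketch below shows the rotation this helper performs once the sinusoidal table has been reshaped to (B, H, T, D/H). It assumes the RoFormer-style interleaved [sin, cos] layout, and `rope_rotate` is an illustrative stand-in, not the exact PaddleSpeech `apply_rotary_position_embeddings` implementation:

```python
import paddle

def rope_rotate(x, sinusoidal):
    # x: (B, H, T, D/H); sinusoidal: (B, T, D) with interleaved [sin, cos] pairs.
    B, H, T, Dh = x.shape
    # (B, T, D) -> (B, T, H, D/H) -> (B, H, T, D/H), as the comment above describes.
    pos = sinusoidal.reshape([B, T, H, Dh]).transpose([0, 2, 1, 3])
    sin_pos = paddle.repeat_interleave(pos[..., 0::2], 2, axis=-1)
    cos_pos = paddle.repeat_interleave(pos[..., 1::2], 2, axis=-1)
    # Companion tensor (-x2, x1, -x4, x3, ...): each even/odd feature pair gets rotated.
    x_rot = paddle.stack([-x[..., 1::2], x[..., 0::2]], axis=-1).reshape(x.shape)
    return x * cos_pos + x_rot * sin_pos

q = paddle.randn([1, 4, 16, 64])        # (B, H, T, D/H)
pos_emb = paddle.randn([1, 16, 4 * 64]) # (B, T, D)
print(rope_rotate(q, pos_emb).shape)    # [1, 4, 16, 64]
```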
@@ -476,6 +476,7 @@ class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention):
where `cache_t == chunk_size * num_decoding_left_chunks`
and `head * d_k == size`
"""
q, k, v = self.forward_qkv(query, key, value)
# q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k)
@@ -504,7 +505,12 @@ class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention):
new_cache = paddle.concat((k, v), axis=-1)
# f_{q,k}(x_m, m) = R^d_{\theta, m} W_{q,k} x_m, where m is the position index
q, k = self.apply_rotary_position_embeddings(pos_emb, q, k)
# q_t is always chunk_size
q_t = q.shape[2]
q = self.apply_rotary_position_embeddings(pos_emb[:, -q_t:, :], q)
# k grows during streaming decoding, so it uses the full-length pos_emb.
k = self.apply_rotary_position_embeddings(pos_emb, k)
# dot(q, k)
scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(self.d_k)
return self.forward_attention(v, scores, mask), new_cache
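The two separate calls exist because, in streaming decoding, the keys span the cached history plus the current chunk while the queries cover only the current chunk. The toy illustration below uses made-up sizes (all shapes here are assumptions for demonstration) to show which slice of the positional table each side needs:

```python
import paddle

B, H, Dk = 1, 4, 64
chunk, cached = 16, 32                        # frames in the current chunk / in the cache
q = paddle.randn([B, H, chunk, Dk])           # queries: current chunk only
k = paddle.randn([B, H, cached + chunk, Dk])  # keys: cache + chunk, so they keep growing

pos_emb = paddle.randn([1, cached + chunk, H * Dk])  # sinusoidal table covering every key position

q_t = q.shape[2]
pos_for_q = pos_emb[:, -q_t:, :]   # queries sit at the newest q_t positions
pos_for_k = pos_emb                # keys need the full range, cached and new
print(pos_for_q.shape, pos_for_k.shape)  # [1, 16, 256] [1, 48, 256]
```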
@@ -164,6 +164,7 @@ class RelPositionalEncoding(PositionalEncoding):
assert offset + x.shape[1] < self.max_len, \
    "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format(
        offset, x.shape[1], self.max_len)
x = x * self.xscale
pos_emb = self.pe[:, offset:offset + x.shape[1]]
return self.dropout(x), self.dropout(pos_emb)
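The offset argument is what lets this encoding work chunk by chunk. The sketch below mirrors the assert and the slice with a placeholder table; the sizes and loop are assumptions for illustration only:

```python
import paddle

max_len, D = 5000, 256
pe = paddle.randn([1, max_len, D])     # placeholder for the precomputed table self.pe

def slice_pos_emb(pe, offset, length, max_len=max_len):
    # Same guard as the assert above: the requested window must fit inside the table.
    assert offset + length < max_len, (
        "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format(
            offset, length, max_len))
    return pe[:, offset:offset + length]

offset = 0
for _ in range(3):                     # three streaming chunks of 16 frames each
    pos_emb = slice_pos_emb(pe, offset, 16)
    offset += 16                       # the caller advances offset chunk by chunk
print(offset, pos_emb.shape)           # 48 [1, 16, 256]
```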
......
@@ -233,7 +233,7 @@ class BaseEncoder(nn.Layer):
xs = self.global_cmvn(xs)
# before embed, xs=(B, T, D1), pos_emb=(B=1, T, D)
xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset)
xs, _, _ = self.embed(xs, tmp_masks, offset=offset)
# after embed, xs=(B=1, chunk_size, hidden-dim)
elayers, _, cache_t1, _ = att_cache.shape
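For context, the `att_cache` unpacked here is assumed to have shape (elayers, head, cache_t1, d_k * 2), with keys and values concatenated on the last axis as in `new_cache = paddle.concat((k, v), axis=-1)` above. The toy loop below only illustrates how cache_t1 grows across chunks; the sizes are assumptions:

```python
import paddle

elayers, head, d_k, chunk = 12, 4, 64, 16
att_cache = paddle.zeros([elayers, head, chunk, d_k * 2])   # cache after the first chunk

for _ in range(2):                                          # two more chunks arrive
    new_kv = paddle.zeros([elayers, head, chunk, d_k * 2])  # this chunk's keys/values
    att_cache = paddle.concat([att_cache, new_kv], axis=2)  # cache_t1 grows by `chunk`
    _, _, cache_t1, _ = att_cache.shape
    print(cache_t1)                                         # 32, then 48
```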
......