提交 0a5cc555 编写于 作者: H Hui Zhang

rope for streaming decoding

上级 b56fb85c
...@@ -4,7 +4,10 @@ ...@@ -4,7 +4,10 @@
paddle version: 2.5.0
paddlespeech version: 1.5.0
Need set `decoding.decoding_chunk_size=16` when decoding. Tesla V100-SXM2-32GB: 1 node, 4 card
Global BachSize: 32 * 4
Training Done: 1 day, 12:56:39.639646
### `decoding.decoding_chunk_size=16`
> chunk_size=16, ((16 - 1) * 4 + 7) * 10ms = (16 * 4 + 3) * 10ms = 670ms
...@@ -15,15 +18,14 @@ Need set `decoding.decoding_chunk_size=16` when decoding. ...@@ -15,15 +18,14 @@ Need set `decoding.decoding_chunk_size=16` when decoding.
| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | 16, -1 | - | |
| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | |
## Conformer ### `decoding.decoding_chunk_size=-1`
paddle version: 2.2.2
paddlespeech version: 1.0.1 | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | CER |
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| --- | --- | --- | --- | --- | --- | --- | --- | | roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention | -1, -1 | - | 5.39 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 | | roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_greedy_search | -1, -1 | - | 5.51 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 | | roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | -1, -1 | - | 5.51 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_prefix_beam_search | - | 0.0480 | | roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | -1, -1 | - | 4.99 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 |
## Conformer Streaming ## Conformer Streaming
...@@ -39,6 +41,17 @@ Need set `decoding.decoding_chunk_size=16` when decoding. ...@@ -39,6 +41,17 @@ Need set `decoding.decoding_chunk_size=16` when decoding.
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | 0.051968 |
## Conformer
paddle version: 2.2.2
paddlespeech version: 1.0.1
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_prefix_beam_search | - | 0.0480 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 |
## Transformer ## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | | Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER |
......
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""Multi-Head Attention layer definition."""
import math
from typing import Tuple
from typing import List from typing import List
from typing import Tuple
import paddle
from paddle import nn
...@@ -428,7 +428,7 @@ class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention): ...@@ -428,7 +428,7 @@ class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention):
# (B,H,T,D)
ndim = tensors[0].dim()
_,H,T,D = tensors[0].shape _, H, T, D = tensors[0].shape
# sinusoidal shape same with tensors[0]
# [B,T,D] -> [B,T,H,D/H] -> (B,H,T,D/H)
...@@ -476,6 +476,7 @@ class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention): ...@@ -476,6 +476,7 @@ class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention):
where `cache_t == chunk_size * num_decoding_left_chunks`
and `head * d_k == size`
"""
q, k, v = self.forward_qkv(query, key, value)
# q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k)
...@@ -504,7 +505,12 @@ class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention): ...@@ -504,7 +505,12 @@ class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention):
new_cache = paddle.concat((k, v), axis=-1)
# f{q,k}(x_m, m) = R^d_{\theta, m} W_{q,k} x_m, m is position index
q, k = self.apply_rotary_position_embeddings(pos_emb, q, k) # q_t always is chunk_size
q_t = q.shape[2]
q = self.apply_rotary_position_embeddings(pos_emb[:, -q_t:, :], q)
# k will increase when in streaming decoding.
k = self.apply_rotary_position_embeddings(pos_emb, k)
# dot(q, k)
scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(self.d_k)
return self.forward_attention(v, scores, mask), new_cache
...@@ -164,6 +164,7 @@ class RelPositionalEncoding(PositionalEncoding): ...@@ -164,6 +164,7 @@ class RelPositionalEncoding(PositionalEncoding):
assert offset + x.shape[
    1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format(
        offset, x.shape[1], self.max_len)
x = x * self.xscale x = x * self.xscale
pos_emb = self.pe[:, offset:offset + x.shape[1]]
return self.dropout(x), self.dropout(pos_emb)
......
...@@ -233,7 +233,7 @@ class BaseEncoder(nn.Layer): ...@@ -233,7 +233,7 @@ class BaseEncoder(nn.Layer):
xs = self.global_cmvn(xs)
# before embed, xs=(B, T, D1), pos_emb=(B=1, T, D)
xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) xs, _, _ = self.embed(xs, tmp_masks, offset=offset)
# after embed, xs=(B=1, chunk_size, hidden-dim)
elayers, _, cache_t1, _ = att_cache.shape
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册