Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
5659bd23
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
1 年多 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
5659bd23
编写于
4月 06, 2021
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add u2 model
上级
498104b0
变更
4
显示空白变更内容
内联
并排
Showing
4 changed file
with
148 addition
and
80 deletion
+148
-80
deepspeech/__init__.py
deepspeech/__init__.py
+34
-0
deepspeech/models/u2.py
deepspeech/models/u2.py
+106
-77
deepspeech/modules/subsampling.py
deepspeech/modules/subsampling.py
+1
-1
deepspeech/utils/utility.py
deepspeech/utils/utility.py
+7
-2
未找到文件。
deepspeech/__init__.py
浏览文件 @
5659bd23
...
...
@@ -78,7 +78,32 @@ if not hasattr(paddle, 'cat'):
"override cat of paddle if exists or register, remove this when fixed!"
)
paddle
.
cat
=
cat
########### hack paddle.Tensor #############
def item(x: paddle.Tensor):
    """Convert a float32 tensor to a Python ``float``.

    Args:
        x (paddle.Tensor): tensor to convert; only float32 is accepted.

    Returns:
        float: the result of ``float(x)``.

    Raises:
        ValueError: if ``x.dtype`` is not float32.
    """
    # Compare against the public dtype alias instead of reaching into
    # ``paddle.fluid.core_avx``: that internal module path is tied to AVX
    # builds of Paddle and is not guaranteed to exist elsewhere.
    if x.dtype == paddle.float32:
        return float(x)
    raise ValueError("not support")
# Install the fallback ``item`` only when this Paddle build does not already
# provide ``paddle.Tensor.item``.
if not hasattr(paddle.Tensor, 'item'):
    # ``Logger.warn`` is a deprecated alias; ``warning`` is the supported name.
    logger.warning(
        "override item of paddle.Tensor if exists or register, remove this when fixed!"
    )
    paddle.Tensor.item = item
def func_long(x: paddle.Tensor):
    """Cast ``x`` to ``paddle.long`` with ``paddle.cast`` and return the result."""
    target_dtype = paddle.long
    return paddle.cast(x, target_dtype)
# Install ``func_long`` as ``paddle.Tensor.long`` only when the attribute is
# missing from this Paddle build.
if not hasattr(paddle.Tensor, 'long'):
    # ``Logger.warn`` is a deprecated alias; ``warning`` is the supported name.
    logger.warning(
        "override long of paddle.Tensor if exists or register, remove this when fixed!"
    )
    paddle.Tensor.long = func_long
if
not
hasattr
(
paddle
.
Tensor
,
'numel'
):
logger
.
warn
(
"override numel of paddle.Tensor if exists or register, remove this when fixed!"
...
...
@@ -247,6 +272,15 @@ if not hasattr(paddle.Tensor, 'to'):
logger
.
warn
(
"register user to to paddle.Tensor, remove this when fixed!"
)
setattr
(
paddle
.
Tensor
,
'to'
,
to
)
def func_float(x: paddle.Tensor) -> paddle.Tensor:
    """Return ``x`` converted through ``x.astype(paddle.float)``."""
    converted = x.astype(paddle.float)
    return converted
# Install ``func_float`` as ``paddle.Tensor.float`` only when the attribute is
# missing from this Paddle build.
if not hasattr(paddle.Tensor, 'float'):
    # ``Logger.warn`` is a deprecated alias; ``warning`` is the supported name.
    logger.warning(
        "register user float to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'float', func_float)
########### hack paddle.nn.functional #############
...
...
deepspeech/models/u2.py
浏览文件 @
5659bd23
...
...
@@ -53,14 +53,13 @@ from deepspeech.utils.ctc_utils import remove_duplicates_and_blank
logger
=
logging
.
getLogger
(
__name__
)
__all__
=
[
'U2
Model'
]
__all__
=
[
'U2
TransformerModel'
,
"U2ConformerModel"
]
class
U2Model
(
nn
.
Module
):
"""CTC-Attention hybrid Encoder-Decoder model"""
def
__init__
(
self
,
def
__init__
(
self
,
vocab_size
:
int
,
encoder
:
TransformerEncoder
,
decoder
:
TransformerDecoder
,
...
...
@@ -68,7 +67,7 @@ class U2Model(nn.Module):
ctc_weight
:
float
=
0.5
,
ignore_id
:
int
=
IGNORE_ID
,
lsm_weight
:
float
=
0.0
,
length_normalized_loss
:
bool
=
False
,
):
length_normalized_loss
:
bool
=
False
):
assert
0.0
<=
ctc_weight
<=
1.0
,
ctc_weight
super
().
__init__
()
...
...
@@ -263,51 +262,54 @@ class U2Model(nn.Module):
# Stop if all batch and all beam produce eos
if
end_flag
.
sum
()
==
running_size
:
break
# 2.1 Forward decoder step
hyps_mask
=
subsequent_mask
(
i
).
unsqueeze
(
0
).
repeat
(
running_size
,
1
,
1
).
to
(
device
)
# (B*N, i, i)
# logp: (B*N, vocab)
logp
,
cache
=
self
.
decoder
.
forward_one_step
(
encoder_out
,
encoder_mask
,
hyps
,
hyps_mask
,
cache
)
# 2.2 First beam prune: select topk best prob at current time
top_k_logp
,
top_k_index
=
logp
.
topk
(
beam_size
)
# (B*N, N)
top_k_logp
=
mask_finished_scores
(
top_k_logp
,
end_flag
)
top_k_index
=
mask_finished_preds
(
top_k_index
,
end_flag
,
self
.
eos
)
# 2.3 Seconde beam prune: select topk score with history
scores
=
scores
+
top_k_logp
# (B*N, N), broadcast add
scores
=
scores
.
view
(
batch_size
,
beam_size
*
beam_size
)
# (B, N*N)
scores
,
offset_k_index
=
scores
.
topk
(
k
=
beam_size
)
# (B, N)
scores
=
scores
.
view
(
-
1
,
1
)
# (B*N, 1)
# 2.4. Compute base index in top_k_index,
# regard top_k_index as (B*N*N),regard offset_k_index as (B*N),
# then find offset_k_index in top_k_index
base_k_index
=
torch
.
arange
(
batch_size
,
device
=
device
).
view
(
-
1
,
1
).
repeat
([
1
,
beam_size
])
# (B, N)
base_k_index
=
paddle
.
arange
(
batch_size
).
view
(
-
1
,
1
).
repeat
(
[
1
,
beam_size
])
# (B, N)
base_k_index
=
base_k_index
*
beam_size
*
beam_size
best_k_index
=
base_k_index
.
view
(
-
1
)
+
offset_k_index
.
view
(
-
1
)
# (B*N)
# 2.5 Update best hyps
best_k_pred
=
torch
.
index_select
(
top_k_index
.
view
(
-
1
),
dim
=-
1
,
index
=
best_k_index
)
# (B*N)
best_k_pred
=
paddle
.
index_select
(
top_k_index
.
view
(
-
1
),
index
=
best_k_index
,
axis
=
0
)
# (B*N)
best_hyps_index
=
best_k_index
//
beam_size
last_best_k_hyps
=
torch
.
index_select
(
hyps
,
dim
=
0
,
index
=
best_hyps_index
)
# (B*N, i)
hyps
=
torch
.
cat
(
last_best_k_hyps
=
paddle
.
index_select
(
hyps
,
index
=
best_hyps_index
,
axis
=
0
)
# (B*N, i)
hyps
=
paddle
.
cat
(
(
last_best_k_hyps
,
best_k_pred
.
view
(
-
1
,
1
)),
dim
=
1
)
# (B*N, i+1)
# 2.6 Update end flag
end_flag
=
torch
.
eq
(
hyps
[:,
-
1
],
self
.
eos
).
view
(
-
1
,
1
)
end_flag
=
paddle
.
eq
(
hyps
[:,
-
1
],
self
.
eos
).
view
(
-
1
,
1
)
# 3. Select best of best
scores
=
scores
.
view
(
batch_size
,
beam_size
)
# TODO: length normalization
best_index
=
torch
.
argmax
(
scores
,
dim
=-
1
).
long
(
)
best_hyps_index
=
best_index
+
torch
.
arange
(
batch_size
,
dtype
=
torch
.
long
,
device
=
device
)
*
beam_size
best_hyps
=
torch
.
index_select
(
hyps
,
dim
=
0
,
index
=
best_hyps_index
)
best_index
=
paddle
.
argmax
(
scores
,
axis
=-
1
).
long
()
# (B
)
best_hyps_index
=
best_index
+
paddle
.
arange
(
batch_size
,
dtype
=
paddle
.
long
)
*
beam_size
best_hyps
=
paddle
.
index_select
(
hyps
,
index
=
best_hyps_index
,
axis
=
0
)
best_hyps
=
best_hyps
[:,
1
:]
return
best_hyps
...
...
@@ -346,8 +348,8 @@ class U2Model(nn.Module):
ctc_probs
=
self
.
ctc
.
log_softmax
(
encoder_out
)
# (B, maxlen, vocab_size)
topk_prob
,
topk_index
=
ctc_probs
.
topk
(
1
,
dim
=
2
)
# (B, maxlen, 1)
topk_index
=
topk_index
.
view
(
batch_size
,
maxlen
)
# (B, maxlen)
mask
=
make_pad_mask
(
encoder_out_lens
)
# (B, maxlen)
topk_index
=
topk_index
.
masked_fill_
(
mask
,
self
.
eos
)
# (B, maxlen)
pad_
mask
=
make_pad_mask
(
encoder_out_lens
)
# (B, maxlen)
topk_index
=
topk_index
.
masked_fill_
(
pad_
mask
,
self
.
eos
)
# (B, maxlen)
hyps
=
[
hyp
.
tolist
()
for
hyp
in
topk_index
]
hyps
=
[
remove_duplicates_and_blank
(
hyp
)
for
hyp
in
hyps
]
return
hyps
...
...
@@ -360,7 +362,7 @@ class U2Model(nn.Module):
decoding_chunk_size
:
int
=-
1
,
num_decoding_left_chunks
:
int
=-
1
,
simulate_streaming
:
bool
=
False
,
)
->
Tuple
[
List
[
List
[
in
t
]],
paddle
.
Tensor
]:
blank_id
:
int
=
0
,
)
->
Tuple
[
List
[
Tuple
[
int
,
floa
t
]],
paddle
.
Tensor
]:
""" CTC prefix beam search inner implementation
Args:
speech (paddle.Tensor): (batch, max_len, feat_dim)
...
...
@@ -374,7 +376,7 @@ class U2Model(nn.Module):
simulate_streaming (bool): whether do encoder forward in a
streaming fashion
Returns:
List[
List[int]]: nbest results
List[
Tuple[int, float]]: nbest results, (N,1), (text, likelihood)
paddle.Tensor: encoder output, (1, max_len, encoder_dim),
it will be used for rescoring in attention rescoring mode
"""
...
...
@@ -406,7 +408,7 @@ class U2Model(nn.Module):
ps
=
logp
[
s
].
item
()
for
prefix
,
(
pb
,
pnb
)
in
cur_hyps
:
last
=
prefix
[
-
1
]
if
len
(
prefix
)
>
0
else
None
if
s
==
0
:
# blank
if
s
==
blank_id
:
# blank
n_pb
,
n_pnb
=
next_hyps
[
prefix
]
n_pb
=
log_add
([
n_pb
,
pb
+
ps
,
pnb
+
ps
])
next_hyps
[
prefix
]
=
(
n_pb
,
n_pnb
)
...
...
@@ -491,7 +493,7 @@ class U2Model(nn.Module):
"""
assert
speech
.
shape
[
0
]
==
speech_lengths
.
shape
[
0
]
assert
decoding_chunk_size
!=
0
device
=
speech
.
devi
ce
device
=
speech
.
pla
ce
batch_size
=
speech
.
shape
[
0
]
# For attention rescoring we only support batch_size=1
assert
batch_size
==
1
...
...
@@ -502,22 +504,22 @@ class U2Model(nn.Module):
assert
len
(
hyps
)
==
beam_size
hyps_pad
=
pad_sequence
([
paddle
.
t
ensor
(
hyp
[
0
],
device
=
device
,
dtype
=
torch
.
long
)
paddle
.
t
o_tensor
(
hyp
[
0
],
place
=
device
,
dtype
=
paddle
.
long
)
for
hyp
in
hyps
],
True
,
self
.
ignore_id
)
# (beam_size, max_hyps_len)
hyps_lens
=
paddle
.
tensor
(
[
len
(
hyp
[
0
])
for
hyp
in
hyps
],
devi
ce
=
device
,
dtype
=
torch
.
long
)
# (beam_size,)
hyps_lens
=
paddle
.
t
o_t
ensor
(
[
len
(
hyp
[
0
])
for
hyp
in
hyps
],
pla
ce
=
device
,
dtype
=
paddle
.
long
)
# (beam_size,)
hyps_pad
,
_
=
add_sos_eos
(
hyps_pad
,
self
.
sos
,
self
.
eos
,
self
.
ignore_id
)
hyps_lens
=
hyps_lens
+
1
# Add <sos> at begining
encoder_out
=
encoder_out
.
repeat
(
beam_size
,
1
,
1
)
encoder_mask
=
torch
.
ones
(
beam_size
,
1
,
encoder_out
.
size
(
1
),
dtype
=
torch
.
bool
,
device
=
device
)
encoder_mask
=
paddle
.
ones
(
beam_size
,
1
,
encoder_out
.
size
(
1
),
dtype
=
paddle
.
bool
)
decoder_out
,
_
=
self
.
decoder
(
encoder_out
,
encoder_mask
,
hyps_pad
,
hyps_lens
)
# (beam_size, max_hyps_len, vocab_size)
decoder_out
=
torch
.
nn
.
functional
.
log_softmax
(
decoder_out
,
dim
=-
1
)
decoder_out
=
decoder_out
.
cpu
().
numpy
()
decoder_out
=
paddle
.
nn
.
functional
.
log_softmax
(
decoder_out
,
dim
=-
1
)
decoder_out
=
decoder_out
.
numpy
()
# Only use decoder score for rescoring
best_score
=
-
float
(
'inf'
)
best_index
=
0
...
...
@@ -609,56 +611,83 @@ class U2Model(nn.Module):
hypothesis from ctc prefix beam search and one encoder output
Args:
hyps (paddle.Tensor): hyps from ctc prefix beam search, already
pad sos at the begining
hyps_lens (paddle.Tensor): length of each hyp in hyps
encoder_out (paddle.Tensor): corresponding encoder output
pad sos at the begining
, (B, T)
hyps_lens (paddle.Tensor): length of each hyp in hyps
, (B)
encoder_out (paddle.Tensor): corresponding encoder output
, (B=1, T, D)
Returns:
paddle.Tensor: decoder output
paddle.Tensor: decoder output
, (B, L)
"""
assert
encoder_out
.
size
(
0
)
==
1
num_hyps
=
hyps
.
size
(
0
)
assert
hyps_lens
.
size
(
0
)
==
num_hyps
encoder_out
=
encoder_out
.
repeat
(
num_hyps
,
1
,
1
)
encoder_mask
=
torch
.
ones
(
num_hyps
,
1
,
encoder_out
.
size
(
1
),
dtype
=
torch
.
bool
,
device
=
encoder_out
.
device
)
decoder_out
,
_
=
self
.
decoder
(
encoder_out
,
encoder_mask
,
hyps
,
hyps_lens
)
# (num_hyps, max_hyps_len, vocab_size)
decoder_out
=
torch
.
nn
.
functional
.
log_softmax
(
decoder_out
,
dim
=-
1
)
# (B, 1, T)
encoder_mask
=
paddle
.
ones
(
[
num_hyps
,
1
,
encoder_out
.
size
(
1
)],
dtype
=
paddle
.
bool
)
# (num_hyps, max_hyps_len, vocab_size)
decoder_out
,
_
=
self
.
decoder
(
encoder_out
,
encoder_mask
,
hyps
,
hyps_lens
)
decoder_out
=
paddle
.
nn
.
functional
.
log_softmax
(
decoder_out
,
dim
=-
1
)
return
decoder_out
def
init_asr_model
(
configs
):
class U2TransformerModel(U2Model):
    """U2 model assembled from a config dict with a ``TransformerEncoder``."""

    def __init__(self, configs: dict):
        """Build the encoder, decoder and CTC head, then initialise ``U2Model``.

        Args:
            configs (dict): reads the keys ``cmvn_file``, ``is_json_cmvn``
                (only when ``cmvn_file`` is not None), ``input_dim``,
                ``output_dim``, ``encoder_conf``, ``decoder_conf`` and
                ``model_conf``. The optional ``encoder`` key defaults to
                ``'transformer'`` and must equal it.

        Raises:
            AssertionError: if ``configs['encoder']`` is set to anything other
                than ``'transformer'``.
        """
        if configs['cmvn_file'] is not None:
            mean, istd = load_cmvn(configs['cmvn_file'],
                                   configs['is_json_cmvn'])
            global_cmvn = GlobalCMVN(
                paddle.to_tensor(mean).float(),
                paddle.to_tensor(istd).float())
        else:
            global_cmvn = None

        input_dim = configs['input_dim']
        vocab_size = configs['output_dim']

        encoder_type = configs.get('encoder', 'transformer')
        assert encoder_type == 'transformer'

        encoder = TransformerEncoder(
            input_dim, global_cmvn=global_cmvn, **configs['encoder_conf'])
        decoder = TransformerDecoder(vocab_size,
                                     encoder.output_size(),
                                     **configs['decoder_conf'])
        ctc = CTCDecoder(vocab_size, encoder.output_size())

        # Delegate to the base-class initializer. Calling ``self.__init__``
        # here would re-enter this method instead of ``U2Model.__init__``.
        super().__init__(
            vocab_size=vocab_size,
            encoder=encoder,
            decoder=decoder,
            ctc=ctc,
            **configs['model_conf'])
class U2ConformerModel(U2Model):
    """U2 model assembled from a config dict with a ``ConformerEncoder``."""

    def __init__(self, configs: dict):
        """Build the encoder, decoder and CTC head, then initialise ``U2Model``.

        Args:
            configs (dict): reads the keys ``cmvn_file``, ``is_json_cmvn``
                (only when ``cmvn_file`` is not None), ``input_dim``,
                ``output_dim``, ``encoder_conf``, ``decoder_conf`` and
                ``model_conf``. The optional ``encoder`` key defaults to
                ``'conformer'`` and must equal it.

        Raises:
            AssertionError: if ``configs['encoder']`` is set to anything other
                than ``'conformer'``.
        """
        if configs['cmvn_file'] is not None:
            mean, istd = load_cmvn(configs['cmvn_file'],
                                   configs['is_json_cmvn'])
            global_cmvn = GlobalCMVN(
                paddle.to_tensor(mean).float(),
                paddle.to_tensor(istd).float())
        else:
            global_cmvn = None

        input_dim = configs['input_dim']
        vocab_size = configs['output_dim']

        encoder_type = configs.get('encoder', 'conformer')
        assert encoder_type == 'conformer'

        encoder = ConformerEncoder(
            input_dim, global_cmvn=global_cmvn, **configs['encoder_conf'])
        decoder = TransformerDecoder(vocab_size,
                                     encoder.output_size(),
                                     **configs['decoder_conf'])
        ctc = CTCDecoder(vocab_size, encoder.output_size())

        # Delegate to the base-class initializer. Calling ``self.__init__``
        # here would re-enter this method instead of ``U2Model.__init__``.
        super().__init__(
            vocab_size=vocab_size,
            encoder=encoder,
            decoder=decoder,
            ctc=ctc,
            **configs['model_conf'])
deepspeech/modules/subsampling.py
浏览文件 @
5659bd23
...
...
@@ -37,7 +37,7 @@ class BaseSubsampling(nn.Layer):
self
.
pos_enc
=
pos_enc_class
# window size = (1 + right_context) + (chunk_size -1) * subsampling_rate
self
.
right_context
=
0
# stride =
chunk_size * subsampling_rat
e
# stride =
subsampling_rate * chunk_siz
e
self
.
subsampling_rate
=
1
def
position_encoding
(
self
,
offset
:
int
,
size
:
int
)
->
paddle
.
Tensor
:
...
...
deepspeech/utils/utility.py
浏览文件 @
5659bd23
...
...
@@ -63,8 +63,13 @@ def add_arguments(argname, type, default, help, argparser, **kwargs):
def
log_add
(
args
:
List
[
int
])
->
float
:
"""
Stable log add
"""Stable log add
Args:
args (List[int]): log scores
Returns:
float: sum of log scores
"""
if
all
(
a
==
-
float
(
'inf'
)
for
a
in
args
):
return
-
float
(
'inf'
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录