PaddlePaddle / DeepSpeech
Commit 418d85ef
Author: huangyuxin
Committed on: Oct 25, 2021
Parent: e4a9328c

fix some bug and complete the recog.py
Showing 3 changed files with 48 additions and 51 deletions.
deepspeech/decoders/recog.py          +10  -13
deepspeech/models/lm/transformer.py   +32  -33
deepspeech/modules/encoder.py          +6   -5
deepspeech/decoders/recog.py @ 418d85ef
@@ -28,8 +28,8 @@ from .utils import add_results_to_json
 from deepspeech.exps import dynamic_import_tester
 from deepspeech.io.reader import LoadInputsAndTargets
 from deepspeech.models.asr_interface import ASRInterface
-from deepspeech.utils.log import Log
 from deepspeech.models.lm.transformer import TransformerLM
+from deepspeech.utils.log import Log
 # from espnet.asr.asr_utils import get_model_conf
 # from espnet.asr.asr_utils import torch_load
 # from espnet.nets.lm_interface import dynamic_import_lm
@@ -80,8 +80,7 @@ def recog_v2(args):
         sort_in_input_length=False,
         preprocess_conf=confs.collator.augmentation_config
         if args.preprocess_conf is None else args.preprocess_conf,
         preprocess_args={"train": False},
     )
 
     if args.rnnlm:
         lm_path = args.rnnlm
@@ -120,8 +119,7 @@ def recog_v2(args):
         ctc=args.ctc_weight,
         lm=args.lm_weight,
         ngram=args.ngram_weight,
         length_bonus=args.penalty,
     )
     beam_search = BeamSearch(
         beam_size=args.beam_size,
         vocab_size=len(char_list),
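Note: the weights dict built here assigns one scalar per scorer (decoder, ctc, lm, ngram, length_bonus), and the ESPnet-style BeamSearch ranks partial hypotheses by the weighted sum of the scorers' log-probabilities. A minimal sketch of that combination rule, with made-up scores and standalone names, not the actual BeamSearch internals:

# Hypothetical illustration of how per-scorer weights combine log-probabilities.
# `scores` maps scorer name -> log-prob contribution of one candidate token;
# `weights` mirrors the dict built in the hunk above.
def combined_score(scores: dict, weights: dict) -> float:
    """Weighted sum of scorer log-probabilities for one candidate."""
    return sum(weights[name] * scores[name] for name in weights if name in scores)

weights = {"decoder": 0.5, "ctc": 0.5, "lm": 0.3, "length_bonus": 0.1}
scores = {"decoder": -1.2, "ctc": -0.9, "lm": -2.0, "length_bonus": 1.0}
print(combined_score(scores, weights))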
@@ -130,8 +128,7 @@ def recog_v2(args):
         sos=model.sos,
         eos=model.eos,
         token_list=char_list,
         pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
     )
     # TODO(karita): make all scorers batchfied
     if args.batchsize == 1:
@@ -178,7 +175,8 @@ def recog_v2(args):
             logger.info(f'feat: {feat.shape}')
             enc = model.encode(paddle.to_tensor(feat).to(dtype))
             logger.info(f'eout: {enc.shape}')
             nbest_hyps = beam_search(x=enc,
                                      maxlenratio=args.maxlenratio,
                                      minlenratio=args.minlenratio)
             nbest_hyps = [
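Note: maxlenratio and minlenratio bound the decoded length relative to the encoder output length; in the ESPnet-style search this code follows, a ratio of 0 is conventionally "no upper bound" (search may run to the encoder length). A sketch under those assumed semantics; the helper name is invented:

# Sketch: deriving hypothesis length bounds from the encoder output length
# (assumed ESPnet-style semantics of maxlenratio/minlenratio).
def length_bounds(enc_len: int, maxlenratio: float, minlenratio: float):
    maxlen = enc_len if maxlenratio == 0 else max(1, int(maxlenratio * enc_len))
    minlen = int(minlenratio * enc_len)
    return minlen, maxlen

print(length_bounds(100, 0.0, 0.0))  # (0, 100)
print(length_bounds(100, 0.5, 0.1))  # (10, 50)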
@@ -190,9 +188,8 @@ def recog_v2(args):
             item = new_js[name]['output'][0]  # 1-best
             ref = item['text']
             rec_text = item['rec_text'].replace('▁', ' ').replace('<eos>',
                                                                   '').strip()
             rec_tokenid = list(map(int, item['rec_tokenid'].split()))
             f.write({
                 "utt": name,
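Note: the rec_text post-processing assumes SentencePiece-style output, where '▁' (U+2581) marks a word boundary and '<eos>' terminates the hypothesis. The same transformation in isolation:

# '▁' marks word boundaries in SentencePiece output; '<eos>' ends the hypothesis.
rec = "▁hello▁world<eos>"
text = rec.replace('▁', ' ').replace('<eos>', '').strip()
assert text == "hello world"
print(text)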
deepspeech/models/lm/transformer.py @ 418d85ef
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging
 from typing import Any
 from typing import List
 from typing import Tuple
@@ -20,12 +21,12 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
-from deepspeech.modules.mask import subsequent_mask
-from deepspeech.modules.encoder import TransformerEncoder
 from deepspeech.decoders.scorers.scorer_interface import BatchScorerInterface
 from deepspeech.models.lm_interface import LMInterface
-import logging
+from deepspeech.modules.encoder import TransformerEncoder
+from deepspeech.modules.mask import subsequent_mask
 
 
 class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
     def __init__(
             self,
@@ -37,9 +38,9 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
             unit: int=1024,
             layer: int=4,
             dropout_rate: float=0.5,
             emb_dropout_rate: float=0.0,
             att_dropout_rate: float=0.0,
             tie_weights: bool=False, ):
         nn.Layer.__init__(self)
         if pos_enc == "sinusoidal":
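Note: putting the defaults above together, constructing the LM would look roughly like the sketch below; n_vocab is not visible in this excerpt and the pos_enc value is only inferred from the branch tested in __init__, so both are assumptions:

# Sketch: instantiating the LM with the defaults visible in this hunk.
# n_vocab and pos_enc are placeholders, not taken from the commit.
lm = TransformerLM(
    n_vocab=5002,           # hypothetical vocabulary size
    pos_enc="sinusoidal",   # the branch checked in __init__ above
    unit=1024,
    layer=4,
    dropout_rate=0.5,
    emb_dropout_rate=0.0,
    att_dropout_rate=0.0,
    tie_weights=False, )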
@@ -84,15 +85,12 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
             ), "Tie Weights: True need embedding and final dimensions to match"
             self.decoder.weight = self.embed.weight
 
     def _target_mask(self, ys_in_pad):
         ys_mask = ys_in_pad != 0
         m = subsequent_mask(ys_mask.size(-1)).unsqueeze(0)
         return ys_mask.unsqueeze(-2) & m
 
     def forward(
             self, x: paddle.Tensor, t: paddle.Tensor
     ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         """Compute LM loss value from buffer sequences.
@@ -119,7 +117,8 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         emb = self.embed(x)
         h, _ = self.encoder(emb, xlen)
         y = self.decoder(h)
         loss = F.cross_entropy(
             y.view(-1, y.shape[-1]), t.view(-1), reduction="none")
         mask = xm.to(dtype=loss.dtype)
         logp = loss * mask.view(-1)
         logp = logp.sum()
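Note: the loss path computes per-token cross entropy with reduction="none", zeroes pad positions with the mask, and sums; dividing by the mask total gives the per-token negative log-likelihood that perplexity is derived from. A standalone sketch with made-up tensors, using reshape in place of the patched .view:

import paddle
import paddle.nn.functional as F

logits = paddle.randn([2, 4, 10])                          # (batch, time, vocab)
targets = paddle.to_tensor([[1, 2, 3, 0], [4, 5, 0, 0]])   # 0 = pad id
loss = F.cross_entropy(
    logits.reshape([-1, 10]), targets.reshape([-1]), reduction="none")
mask = (targets != 0).astype(loss.dtype).reshape([-1])
nll = (loss * mask).sum()        # summed masked NLL, as in forward() above
count = mask.sum()               # number of real (non-pad) tokens
print(float(nll / count))        # per-token NLL; exp() of this is perplexity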
@@ -150,16 +149,16 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         emb = self.embed(y)
         h, _, cache = self.encoder.forward_one_step(
             emb, self._target_mask(y), cache=state)
         h = self.decoder(h[:, -1])
         logp = F.log_softmax(h).squeeze(0)
         return logp, cache
 
     # batch beam search API (see BatchScorerInterface)
     def batch_score(
             self,
             ys: paddle.Tensor,
             states: List[Any],
             xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]:
         """Score new token batch (required).
 
         Args:
@@ -193,13 +192,13 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         # batch decoding
         h, _, states = self.encoder.forward_one_step(
             emb, self._target_mask(ys), cache=batch_state)
         h = self.decoder(h[:, -1])
         logp = F.log_softmax(h)
 
         # transpose state of [layer, batch] into [batch, layer]
         state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)]
         return logp, state_list
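Note: the closing list comprehension regroups the cache from the encoder's [layer][batch] layout into the [batch][layer] layout that the BatchScorerInterface hands back per hypothesis. The regrouping in isolation:

# Regroup a [layer][batch] nested list into [batch][layer], as batch_score does.
n_layers, n_batch = 2, 3
states = [[f"L{i}B{b}" for b in range(n_batch)] for i in range(n_layers)]
state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)]
print(state_list)  # [['L0B0', 'L1B0'], ['L0B1', 'L1B1'], ['L0B2', 'L1B2']]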
@@ -236,11 +235,11 @@ if __name__ == "__main__":
     state = None
     output, state = tlm.score(input2, state, None)
     input3 = np.array([5, 10])
     input3 = paddle.to_tensor(input3)
     output, state = tlm.score(input3, state, None)
     input4 = np.array([5, 10, 0])
     input4 = paddle.to_tensor(input4)
     output, state = tlm.score(input4, state, None)
     print("output", output)
deepspeech/modules/encoder.py @ 418d85ef
@@ -24,9 +24,9 @@ from deepspeech.modules.activation import get_activation
 from deepspeech.modules.attention import MultiHeadedAttention
 from deepspeech.modules.attention import RelPositionMultiHeadedAttention
 from deepspeech.modules.conformer_convolution import ConvolutionModule
+from deepspeech.modules.embedding import NoPositionalEncoding
 from deepspeech.modules.embedding import PositionalEncoding
 from deepspeech.modules.embedding import RelPositionalEncoding
-from deepspeech.modules.embedding import NoPositionalEncoding
 from deepspeech.modules.encoder_layer import ConformerEncoderLayer
 from deepspeech.modules.encoder_layer import TransformerEncoderLayer
 from deepspeech.modules.mask import add_optional_chunk_mask
@@ -378,8 +378,7 @@ class TransformerEncoder(BaseEncoder):
             self,
             xs: paddle.Tensor,
             masks: paddle.Tensor,
             cache=None,
     ) -> Tuple[paddle.Tensor, paddle.Tensor]:
         """Encode input frame.
 
         Args:
@@ -397,9 +396,11 @@ class TransformerEncoder(BaseEncoder):
         if isinstance(self.embed, Conv2dSubsampling):
             #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
             xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
         else:
             xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
         #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
         masks = masks.astype(paddle.bool)
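Note: both TODO(Hui Zhang) comments describe the same workaround: at the time, Paddle's stride_slice did not accept bool tensors, so the bool padding mask is cast to the feature dtype before self.embed and restored to bool afterwards. The cast pattern in isolation, with hypothetical shapes:

import paddle

xs = paddle.randn([1, 8, 80])                  # (batch, time, feat)
masks = paddle.ones([1, 1, 8], dtype='bool')   # bool padding mask

# Workaround: ops that cannot take bool tensors get a float/feature-dtype mask...
float_masks = masks.astype(xs.dtype)
# ...and the result is cast back to bool for downstream masking logic.
masks = float_masks.astype(paddle.bool)
print(masks.dtype)  # paddle.bool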