PaddlePaddle / DeepSpeech

Commit 418d85ef
Authored on Oct 25, 2021 by huangyuxin

fix some bug and complete the recog.py

Parent: e4a9328c
Showing 3 changed files with 48 additions and 51 deletions:

deepspeech/decoders/recog.py        +10 -13
deepspeech/models/lm/transformer.py +32 -33
deepspeech/modules/encoder.py       +6  -5
deepspeech/decoders/recog.py (+10 -13)

```diff
@@ -28,8 +28,8 @@ from .utils import add_results_to_json
 from deepspeech.exps import dynamic_import_tester
 from deepspeech.io.reader import LoadInputsAndTargets
 from deepspeech.models.asr_interface import ASRInterface
-from deepspeech.utils.log import Log
+from deepspeech.models.lm.transformer import TransformerLM
+from deepspeech.utils.log import Log
 # from espnet.asr.asr_utils import get_model_conf
 # from espnet.asr.asr_utils import torch_load
 # from espnet.nets.lm_interface import dynamic_import_lm
@@ -80,8 +80,7 @@ def recog_v2(args):
         sort_in_input_length=False,
         preprocess_conf=confs.collator.augmentation_config
         if args.preprocess_conf is None else args.preprocess_conf,
-        preprocess_args={"train": False},
-    )
+        preprocess_args={"train": False}, )

     if args.rnnlm:
         lm_path = args.rnnlm
@@ -120,8 +119,7 @@ def recog_v2(args):
         ctc=args.ctc_weight,
         lm=args.lm_weight,
         ngram=args.ngram_weight,
-        length_bonus=args.penalty,
-    )
+        length_bonus=args.penalty, )
     beam_search = BeamSearch(
         beam_size=args.beam_size,
         vocab_size=len(char_list),
@@ -130,8 +128,7 @@ def recog_v2(args):
         sos=model.sos,
         eos=model.eos,
         token_list=char_list,
-        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
-    )
+        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full", )

     # TODO(karita): make all scorers batchfied
     if args.batchsize == 1:
@@ -178,9 +175,10 @@ def recog_v2(args):
             logger.info(f'feat: {feat.shape}')
             enc = model.encode(paddle.to_tensor(feat).to(dtype))
             logger.info(f'eout: {enc.shape}')
-            nbest_hyps = beam_search(x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio)
+            nbest_hyps = beam_search(
+                x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio)
             nbest_hyps = [
                 h.asdict() for h in nbest_hyps[:min(len(nbest_hyps), args.nbest)]
             ]
@@ -190,9 +188,8 @@ def recog_v2(args):
             item = new_js[name]['output'][0]  # 1-best
             ref = item['text']
-            rec_text = item['rec_text'].replace('▁', ' ').replace(
-                '<eos>', '').strip()
+            rec_text = item['rec_text'].replace('▁', ' ').replace('<eos>', '').strip()
             rec_tokenid = list(map(int, item['rec_tokenid'].split()))
             f.write({
                 "utt": name,
```
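Apart from the added `TransformerLM` import, the recog.py hunks above look like formatting-only changes. The post-processing of `rec_text` near the end is worth spelling out: the hypothesis comes back as SentencePiece pieces, where `▁` conventionally marks a word boundary, and the code strips that marker plus the `<eos>` token. A self-contained sketch of the same string handling (the hypothesis string and token ids below are made up for illustration):

```python
# Illustrative only: the hypothesis string and token ids are invented.
rec_text = "▁the▁quick▁brown▁fox<eos>"
text = rec_text.replace('▁', ' ').replace('<eos>', '').strip()
print(text)          # -> "the quick brown fox"

# `rec_tokenid` is stored as a space-separated string and recovered the same way:
rec_tokenid = list(map(int, "12 7 431 89".split()))
print(rec_tokenid)   # -> [12, 7, 431, 89]
```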
deepspeech/models/lm/transformer.py (+32 -33)

```diff
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging
 from typing import Any
 from typing import List
 from typing import Tuple
@@ -20,12 +21,12 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
-from deepspeech.modules.mask import subsequent_mask
-from deepspeech.modules.encoder import TransformerEncoder
 from deepspeech.decoders.scorers.scorer_interface import BatchScorerInterface
 from deepspeech.models.lm_interface import LMInterface
-import logging
+from deepspeech.modules.encoder import TransformerEncoder
+from deepspeech.modules.mask import subsequent_mask


 class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
     def __init__(self,
@@ -37,9 +38,9 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
                  unit: int=1024,
                  layer: int=4,
                  dropout_rate: float=0.5,
                  emb_dropout_rate: float=0.0,
                  att_dropout_rate: float=0.0,
                  tie_weights: bool=False, ):
         nn.Layer.__init__(self)
         if pos_enc == "sinusoidal":
@@ -84,16 +85,13 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
             ), "Tie Weights: True need embedding and final dimensions to match"
             self.decoder.weight = self.embed.weight

     def _target_mask(self, ys_in_pad):
         ys_mask = ys_in_pad != 0
         m = subsequent_mask(ys_mask.size(-1)).unsqueeze(0)
         return ys_mask.unsqueeze(-2) & m

     def forward(self, x: paddle.Tensor, t: paddle.Tensor
                 ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         """Compute LM loss value from buffer sequences.

         Args:
```
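`_target_mask` above combines a padding mask (token id != 0) with a causal mask, so each position attends only to non-padded positions at or before it. A NumPy re-statement of that logic, assuming `subsequent_mask` returns a lower-triangular boolean matrix as in ESPnet (the `_np` helpers are local to this sketch):

```python
import numpy as np

def subsequent_mask_np(size: int) -> np.ndarray:
    # Causal mask: position i may attend to positions <= i.
    return np.tril(np.ones((size, size), dtype=bool))

def target_mask_np(ys_in_pad: np.ndarray) -> np.ndarray:
    # Padding mask: True where the token id is not the pad id (0 here).
    ys_mask = ys_in_pad != 0                                # (batch, time)
    m = subsequent_mask_np(ys_mask.shape[-1])[None, :, :]   # (1, time, time)
    return ys_mask[:, None, :] & m                          # (batch, time, time)

# Two sequences; the second one is padded with a trailing 0.
ys = np.array([[5, 10, 2], [5, 10, 0]])
print(target_mask_np(ys).astype(int))
```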
```diff
@@ -119,7 +117,8 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         emb = self.embed(x)
         h, _ = self.encoder(emb, xlen)
         y = self.decoder(h)
-        loss = F.cross_entropy(y.view(-1, y.shape[-1]), t.view(-1), reduction="none")
+        loss = F.cross_entropy(
+            y.view(-1, y.shape[-1]), t.view(-1), reduction="none")
         mask = xm.to(dtype=loss.dtype)
         logp = loss * mask.view(-1)
         logp = logp.sum()
```
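The loss hunk computes per-token cross-entropy (`reduction="none"`), zeroes out padded positions with the mask derived from `xm`, and accumulates the rest. A NumPy sketch of the same computation with toy shapes (all values are illustrative):

```python
import numpy as np

rng = np.random.default_rng(0)

# Toy shapes: batch=2, time=3, vocab=5.
logits = rng.normal(size=(2, 3, 5))
targets = np.array([[1, 4, 0], [2, 0, 0]])    # 0 is the padding id
mask = (targets != 0).astype(logits.dtype)    # plays the role of `xm` above

# Per-token negative log-likelihood, i.e. cross-entropy with reduction="none".
flat_logits = logits.reshape(-1, logits.shape[-1])
flat_targets = targets.reshape(-1)
log_probs = flat_logits - np.log(np.exp(flat_logits).sum(-1, keepdims=True))
nll = -log_probs[np.arange(flat_targets.size), flat_targets]

# Zero out padded positions before accumulating, as the hunk does.
logp = (nll * mask.reshape(-1)).sum()
print(logp)
```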
```diff
@@ -150,16 +149,16 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         emb = self.embed(y)
         h, _, cache = self.encoder.forward_one_step(
             emb, self._target_mask(y), cache=state)
         h = self.decoder(h[:, -1])
         logp = F.log_softmax(h).squeeze(0)
         return logp, cache

     # batch beam search API (see BatchScorerInterface)
     def batch_score(self, ys: paddle.Tensor, states: List[Any],
                     xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]:
         """Score new token batch (required).

         Args:
@@ -193,13 +192,13 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         # batch decoding
         h, _, states = self.encoder.forward_one_step(
             emb, self._target_mask(ys), cache=batch_state)
         h = self.decoder(h[:, -1])
         logp = F.log_softmax(h)

         # transpose state of [layer, batch] into [batch, layer]
         state_list = [[states[i][b] for i in range(n_layers)]
                       for b in range(n_batch)]
         return logp, state_list
```
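The comprehension at the end of `batch_score` only reindexes the cache: the encoder returns it as `states[layer][batch]`, while the beam-search API hands state back per hypothesis as `state_list[batch][layer]`. With plain lists the transpose is easy to see (the strings stand in for per-layer cache tensors):

```python
# Cache indexed states[layer][batch] -> state_list[batch][layer].
n_layers, n_batch = 2, 3
states = [[f"layer{i}/hyp{b}" for b in range(n_batch)] for i in range(n_layers)]

state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)]
print(state_list[0])  # -> ['layer0/hyp0', 'layer1/hyp0']
```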
```diff
@@ -214,17 +213,17 @@ if __name__ == "__main__":
         layer=16,
         dropout_rate=0.5, )
     # n_vocab: int,
     # pos_enc: str=None,
     # embed_unit: int=128,
     # att_unit: int=256,
     # head: int=2,
     # unit: int=1024,
     # layer: int=4,
     # dropout_rate: float=0.5,
     # emb_dropout_rate: float = 0.0,
     # att_dropout_rate: float = 0.0,
     # tie_weights: bool = False,):
     paddle.set_device("cpu")
     model_dict = paddle.load("transformerLM.pdparams")
     tlm.set_state_dict(model_dict)
@@ -236,11 +235,11 @@ if __name__ == "__main__":
     state = None
     output, state = tlm.score(input2, state, None)

     input3 = np.array([5, 10])
     input3 = paddle.to_tensor(input3)
     output, state = tlm.score(input3, state, None)

     input4 = np.array([5, 10, 0])
     input4 = paddle.to_tensor(input4)
     output, state = tlm.score(input4, state, None)
     print("output", output)
```
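The `__main__` block above exercises `score()` by feeding progressively longer prefixes and threading the returned state back in. The same pattern, written as a loop against a dummy scorer that only mirrors the `(logp, state) = score(prefix, state, x)` contract shown above (`UniformScorer` is invented for this sketch and is not part of the repository):

```python
import numpy as np

class UniformScorer:
    """Dummy stand-in for a prefix scorer: flat distribution, step-count state."""

    def __init__(self, n_vocab: int):
        self.n_vocab = n_vocab

    def score(self, prefix, state, x):
        logp = np.full(self.n_vocab, -np.log(self.n_vocab))
        new_state = 0 if state is None else state + 1
        return logp, new_state

scorer = UniformScorer(n_vocab=5000)
state = None
prefix = [5]                      # arbitrary start token id
for _ in range(3):
    logp, state = scorer.score(np.array(prefix), state, None)
    prefix.append(int(logp.argmax()))
print(prefix, state)              # state has been threaded through 3 calls
```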
deepspeech/modules/encoder.py (+6 -5)

```diff
@@ -24,9 +24,9 @@ from deepspeech.modules.activation import get_activation
 from deepspeech.modules.attention import MultiHeadedAttention
 from deepspeech.modules.attention import RelPositionMultiHeadedAttention
 from deepspeech.modules.conformer_convolution import ConvolutionModule
+from deepspeech.modules.embedding import NoPositionalEncoding
 from deepspeech.modules.embedding import PositionalEncoding
 from deepspeech.modules.embedding import RelPositionalEncoding
-from deepspeech.modules.embedding import NoPositionalEncoding
 from deepspeech.modules.encoder_layer import ConformerEncoderLayer
 from deepspeech.modules.encoder_layer import TransformerEncoderLayer
 from deepspeech.modules.mask import add_optional_chunk_mask
@@ -378,8 +378,7 @@ class TransformerEncoder(BaseEncoder):
             self,
             xs: paddle.Tensor,
             masks: paddle.Tensor,
-            cache=None,
-    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+            cache=None, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
         """Encode input frame.

         Args:
@@ -397,9 +396,11 @@
         if isinstance(self.embed, Conv2dSubsampling):
             #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
-            xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
+            xs, pos_emb, masks = self.embed(
+                xs, masks.astype(xs.dtype), offset=0)
         else:
-            xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
+            xs, pos_emb, masks = self.embed(
+                xs, masks.astype(xs.dtype), offset=0)
         #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
         masks = masks.astype(paddle.bool)
```
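The two TODOs in the last hunk explain the `astype` churn: at the time, Paddle's `stride_slice` did not accept bool tensors, so the bool padding mask is cast to the feature dtype before the subsampling module and restored to `paddle.bool` afterwards. A minimal sketch of that round trip (shapes are toy values; the `self.embed` call itself is elided):

```python
import paddle

xs = paddle.randn([1, 8, 80])                  # (batch, time, feat), toy shape
masks = paddle.ones([1, 1, 8], dtype='bool')   # non-padding frame mask

# Cast the bool mask to the feature dtype for the subsampling call, then
# restore the bool dtype for the mask logic that follows.
masks_float = masks.astype(xs.dtype)
# ... xs, pos_emb, masks_float = self.embed(xs, masks_float, offset=0) ...
masks = masks_float.astype(paddle.bool)
print(masks.dtype)   # paddle.bool
```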