PaddlePaddle / DeepSpeech

Commit 47dd61e5
Authored May 31, 2022 by huangyuxin
Parent: 0fa32e4a

refactor ds2, cli, server

Showing 27 changed files with 580 additions and 1377 deletions (+580 -1377).
Changed files:

examples/aishell/asr0/conf/deepspeech2.yaml                   +31  -28
examples/aishell/asr0/conf/deepspeech2_online.yaml            +19  -20
examples/aishell/asr0/conf/tuning/decode.yaml                 +3   -3
examples/aishell/asr0/local/data.sh                           +4   -3
examples/aishell/asr0/run.sh                                  +1   -2
examples/librispeech/asr0/conf/deepspeech2.yaml               +29  -29
examples/librispeech/asr0/conf/deepspeech2_online.yaml        +28  -31
examples/librispeech/asr0/local/data.sh                       +4   -3
examples/librispeech/asr0/local/test.sh                       +39  -10
examples/librispeech/asr0/run.sh                              +7   -3
paddlespeech/cli/asr/infer.py                                 +12  -27
paddlespeech/resource/model_alias.py                          +1   -1
paddlespeech/resource/pretrained_models.py                    +16  -16
paddlespeech/s2t/exps/deepspeech2/bin/export.py               +1   -4
paddlespeech/s2t/exps/deepspeech2/bin/test.py                 +1   -4
paddlespeech/s2t/exps/deepspeech2/bin/test_export.py          +0   -3
paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py             +1   -10
paddlespeech/s2t/exps/deepspeech2/bin/train.py                +0   -3
paddlespeech/s2t/exps/deepspeech2/model.py                    +82  -93
paddlespeech/s2t/models/ds2/conv.py                           +14  -152
paddlespeech/s2t/models/ds2/deepspeech2.py                    +242 -92
paddlespeech/s2t/models/ds2/rnn.py                            +0   -315
paddlespeech/s2t/models/ds2_online/__init__.py                +0   -31
paddlespeech/s2t/models/ds2_online/conv.py                    +0   -33
paddlespeech/s2t/models/ds2_online/deepspeech2.py             +0   -397
paddlespeech/server/engine/asr/online/asr_engine.py           +34  -56
paddlespeech/server/engine/asr/paddleinference/asr_engine.py  +11  -8
examples/aishell/asr0/conf/deepspeech2.yaml
@@ -15,50 +15,53 @@ max_output_input_ratio: .inf
 ###########################################
 # Dataloader                              #
 ###########################################
-batch_size: 64  # one gpu
-mean_std_filepath: data/mean_std.json
-unit_type: char
 vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
 feat_dim: 161
-delta_delta: False
 stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 2
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1

 ############################################
 # Network Architecture                     #
 ############################################
 num_conv_layers: 2
-num_rnn_layers: 3
+num_rnn_layers: 5
 rnn_layer_size: 1024
-use_gru: True
-share_rnn_weights: False
+rnn_direction: bidirect # [forward, bidirect]
+num_fc_layers: 0
+fc_layers_size_list: -1,
+use_gru: False
 blank_id: 0
-ctc_grad_norm_type: instance

 ###########################################
 # Training                                #
 ###########################################
-n_epoch: 80
+n_epoch: 50
 accum_grad: 1
-lr: 2.0e-3
-lr_decay: 0.83
+lr: 5.0e-4
+lr_decay: 0.93
 weight_decay: 1.0e-6
 global_grad_clip: 3.0
-log_interval: 100
+dist_sampler: False
+log_interval: 1
 checkpoint:
   kbest_n: 50
   latest_n: 5
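The entry-point scripts changed later in this commit read these YAML files by merging them into a yacs CfgNode created with new_allowed=True. A minimal sketch of that consumption path, assuming it is run from the example directory so the relative config path resolves:

# Minimal sketch: load the example config the way the bin scripts do.
from yacs.config import CfgNode

config = CfgNode(new_allowed=True)            # accept keys not predeclared
config.merge_from_file('conf/deepspeech2.yaml')
config.freeze()

# After the merge, config keys are plain attributes, matching how the
# trainer reads them.
print(config.num_rnn_layers)   # 5 after this commit
print(config.rnn_direction)    # 'bidirect'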
examples/aishell/asr0/conf/deepspeech2_online.yaml
@@ -15,28 +15,26 @@ max_output_input_ratio: .inf
 ###########################################
 # Dataloader                              #
 ###########################################
-batch_size: 64  # one gpu
-mean_std_filepath: data/mean_std.json
-unit_type: char
 vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear #linear, mfcc, fbank
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
 feat_dim: 161
-delta_delta: False
 stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1

 ############################################
 # Network Architecture                     #
...
@@ -54,12 +52,13 @@ blank_id: 0
 ###########################################
 # Training                                #
 ###########################################
-n_epoch: 65
+n_epoch: 30
 accum_grad: 1
 lr: 5.0e-4
 lr_decay: 0.93
 weight_decay: 1.0e-6
 global_grad_clip: 3.0
+dist_sampler: False
 log_interval: 100
 checkpoint:
   kbest_n: 50
...
examples/aishell/asr0/conf/tuning/decode.yaml
@@ -2,9 +2,9 @@ decode_batch_size: 128
 error_rate_type: cer
 decoding_method: ctc_beam_search
 lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
-alpha: 1.9
-beta: 5.0
-beam_size: 300
+alpha: 2.2
+beta: 4.3
+beam_size: 500
 cutoff_prob: 0.99
 cutoff_top_n: 40
 num_proc_bsearch: 10
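For context on what alpha and beta retune here: in DeepSpeech2-style CTC beam search, each candidate transcript is scored by its acoustic log-probability plus an alpha-weighted language-model term and a beta-weighted length bonus. The function below is a simplified illustration of that scoring, not the scorer paddlespeech itself uses:

# Hedged sketch of the alpha/beta trade-off in CTC beam-search rescoring.
import math

def beam_score(log_p_ctc: float, lm_prob: float, num_words: int,
               alpha: float = 2.2, beta: float = 4.3) -> float:
    """Combine acoustic, LM, and length terms for one beam candidate."""
    return log_p_ctc + alpha * math.log(lm_prob) + beta * num_words

# A larger beam_size (300 -> 500 here) keeps more candidates alive per
# step, trading decoding time for a better chance at the best path.
print(beam_score(log_p_ctc=-12.7, lm_prob=1e-4, num_words=5))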
examples/aishell/asr0/local/data.sh
@@ -33,12 +33,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --spectrum_type="linear" \
+    --spectrum_type="fbank" \
+    --feat_dim=161 \
     --delta_delta=false \
     --stride_ms=10 \
-    --window_ms=20 \
+    --window_ms=25 \
     --sample_rate=16000 \
-    --use_dB_normalization=True \
+    --use_dB_normalization=False \
     --num_samples=2000 \
     --num_workers=${num_workers} \
     --output_path="data/mean_std.json"
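The job compute_mean_std.py performs here is estimating per-dimension mean and standard deviation of the training features (now fbank rather than linear spectrogram) from a sample of utterances, saved for feature normalization. A hedged, simplified sketch of that computation; the array shapes and output layout are illustrative assumptions, not the script's exact format:

# Simplified sketch of feature mean/std estimation for normalization.
import json
import numpy as np

def compute_mean_std(features, output_path="mean_std_demo.json"):
    """features: list of [T, feat_dim] arrays sampled from the train set."""
    stacked = np.concatenate(features, axis=0)       # [sum_T, feat_dim]
    stats = {
        "mean": stacked.mean(axis=0).tolist(),
        "std": (stacked.std(axis=0) + 1e-20).tolist(),  # avoid divide-by-zero
    }
    with open(output_path, "w") as f:
        json.dump(stats, f)

compute_mean_std([np.random.rand(100, 161), np.random.rand(80, 161)])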
examples/aishell/asr0/run.sh
@@ -7,8 +7,7 @@ stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml    #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml
 decode_conf_path=conf/tuning/decode.yaml
-avg_num=1
-model_type=offline    # offline or online
+avg_num=10
 audio_file=data/demo_01_03.wav

 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
examples/librispeech/asr0/conf/deepspeech2.yaml
@@ -15,51 +15,51 @@ max_output_input_ratio: .inf
 ###########################################
 # Dataloader                              #
 ###########################################
-batch_size: 20
-mean_std_filepath: data/mean_std.json
-unit_type: char
 vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
 feat_dim: 161
-target_sample_rate: 16000
-max_freq: None
-n_fft: None
 stride_ms: 10.0
-window_ms: 20.0
-delta_delta: False
-dither: 1.0
-use_dB_normalization: True
-target_dB: -20
-random_seed: 0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 2
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1

 ############################################
 # Network Architecture                     #
 ############################################
 num_conv_layers: 2
-num_rnn_layers: 3
-rnn_layer_size: 2048
+num_rnn_layers: 5
+rnn_layer_size: 1024
+rnn_direction: bidirect
+num_fc_layers: 0
+fc_layers_size_list: -1
 use_gru: False
-share_rnn_weights: True
 blank_id: 0

 ###########################################
 # Training                                #
 ###########################################
-n_epoch: 50
+n_epoch: 15
 accum_grad: 1
-lr: 1.0e-3
-lr_decay: 0.83
+lr: 5.0e-4
+lr_decay: 0.93
 weight_decay: 1.0e-6
 global_grad_clip: 5.0
-log_interval: 100
+dist_sampler: False
+log_interval: 1
 checkpoint:
   kbest_n: 50
   latest_n: 5
examples/librispeech/asr0/conf/deepspeech2_online.yaml
@@ -15,39 +15,36 @@ max_output_input_ratio: .inf
 ###########################################
 # Dataloader                              #
 ###########################################
-batch_size: 15
-mean_std_filepath: data/mean_std.json
-unit_type: char
 vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
 feat_dim: 161
-target_sample_rate: 16000
-max_freq: None
-n_fft: None
 stride_ms: 10.0
-window_ms: 20.0
-delta_delta: False
-dither: 1.0
-use_dB_normalization: True
-target_dB: -20
-random_seed: 0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1

 ############################################
 # Network Architecture                     #
 ############################################
 num_conv_layers: 2
-num_rnn_layers: 3
-rnn_layer_size: 2048
+num_rnn_layers: 5
+rnn_layer_size: 1024
 rnn_direction: forward
-num_fc_layers: 2
-fc_layers_size_list: 512, 256
+num_fc_layers: 0
+fc_layers_size_list: -1
 use_gru: False
 blank_id: 0

@@ -55,13 +52,13 @@ blank_id: 0
 ###########################################
 # Training                                #
 ###########################################
-n_epoch: 50
-accum_grad: 4
-lr: 1.0e-3
-lr_decay: 0.83
+n_epoch: 65
+accum_grad: 1
+lr: 5.0e-4
+lr_decay: 0.93
 weight_decay: 1.0e-6
 global_grad_clip: 5.0
-log_interval: 100
+log_interval: 1
 checkpoint:
   kbest_n: 50
   latest_n: 5
examples/librispeech/asr0/local/data.sh
@@ -49,12 +49,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=2000 \
-    --spectrum_type="linear" \
+    --spectrum_type="fbank" \
+    --feat_dim=161 \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10 \
-    --window_ms=20 \
-    --use_dB_normalization=True \
+    --window_ms=25 \
+    --use_dB_normalization=False \
     --num_workers=${num_workers} \
     --output_path="data/mean_std.json"
examples/librispeech/asr0/local/test.sh
@@ -4,6 +4,8 @@ if [ $# != 4 ];then
     echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
     exit -1
 fi

+stage=0
+stop_stage=100
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
...
@@ -19,17 +21,44 @@ if [ $? -ne 0 ]; then
     exit 1
 fi

-python3 -u ${BIN_DIR}/test.py \
---ngpu ${ngpu} \
---config ${config_path} \
---decode_cfg ${decode_config_path} \
---result_file ${ckpt_prefix}.rsl \
---checkpoint_path ${ckpt_prefix} \
---model_type ${model_type}
-
-if [ $? -ne 0 ]; then
-    echo "Failed in evaluation!"
-    exit 1
-fi
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # format the reference test file
+    python utils/format_rsl.py \
+        --origin_ref data/manifest.test-clean.raw \
+        --trans_ref data/manifest.test-clean.text
+
+    python3 -u ${BIN_DIR}/test.py \
+        --ngpu ${ngpu} \
+        --config ${config_path} \
+        --decode_cfg ${decode_config_path} \
+        --result_file ${ckpt_prefix}.rsl \
+        --checkpoint_path ${ckpt_prefix} \
+        --model_type ${model_type}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+
+    python utils/format_rsl.py \
+        --origin_hyp ${ckpt_prefix}.rsl \
+        --trans_hyp ${ckpt_prefix}.rsl.text
+
+    python utils/compute-wer.py --char=1 --v=1 \
+        data/manifest.test-clean.text ${ckpt_prefix}.rsl.text > ${ckpt_prefix}.error
+fi

+if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
+    python utils/format_rsl.py \
+        --origin_ref data/manifest.test-clean.raw \
+        --trans_ref_sclite data/manifest.test.text-clean.sclite
+
+    python utils/format_rsl.py \
+        --origin_hyp ${ckpt_prefix}.rsl \
+        --trans_hyp_sclite ${ckpt_prefix}.rsl.text.sclite
+
+    mkdir -p ${ckpt_prefix}_sclite
+    sclite -i wsj -r data/manifest.test-clean.text.sclite -h ${ckpt_prefix}.rsl.text.sclite -e utf-8 -o all -O ${ckpt_prefix}_sclite -c NOASCII
+fi
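The new compute-wer.py invocation above runs with --char=1, i.e. it reports a character-level error rate. A hedged, self-contained sketch of that metric: a Levenshtein edit distance between reference and hypothesis, normalized by reference length. This standalone version is for illustration only, not the project's scorer:

# Character error rate via classic dynamic-programming edit distance.
def cer(ref: str, hyp: str) -> float:
    r, h = list(ref), list(hyp)
    # dp[i][j] = edits to turn first i ref chars into first j hyp chars
    dp = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        dp[i][0] = i
    for j in range(len(h) + 1):
        dp[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            sub = dp[i - 1][j - 1] + (r[i - 1] != h[j - 1])
            dp[i][j] = min(sub, dp[i - 1][j] + 1, dp[i][j - 1] + 1)
    return dp[len(r)][len(h)] / max(len(r), 1)

print(cer("hello world", "hello word"))  # 1 deletion / 11 chars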
examples/librispeech/asr0/run.sh
@@ -2,13 +2,12 @@
 set -e
 source path.sh

-gpus=0,1,2,3,4,5,6,7
+gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
 decode_conf_path=conf/tuning/decode.yaml
-avg_num=30
-model_type=offline
+avg_num=5
 audio_file=data/demo_002_en.wav

 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
...
@@ -43,6 +42,11 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
 fi

 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # test export ckpt avg_n
+    CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type} || exit -1
+fi
+
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
     # test a single .wav file
     CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
 fi
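The avg_num variable changed above controls how many checkpoints the recipes average into the avg_N checkpoint that testing and export consume. A hedged sketch of that averaging, as a simplified stand-in for the recipe's average-model utility; checkpoint paths are assumptions, and state dicts are assumed to share identical keys:

# Average model parameters across the last N checkpoints.
import paddle

def average_checkpoints(paths):
    """paths: list of .pdparams files for the N chosen checkpoints."""
    avg = None
    for path in paths:
        state = paddle.load(path)
        if avg is None:
            avg = {k: v.astype('float64') for k, v in state.items()}
        else:
            for k in avg:
                avg[k] += state[k].astype('float64')
    # divide once at the end to limit rounding error
    return {k: (v / len(paths)).astype('float32') for k, v in avg.items()}

# Hypothetical usage mirroring avg_num=5:
# paddle.save(average_checkpoints(
#     [f"exp/deepspeech2/checkpoints/{i}.pdparams" for i in range(11, 16)]),
#     "exp/deepspeech2/checkpoints/avg_5.pdparams")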
paddlespeech/cli/asr/infer.py
@@ -138,6 +138,7 @@ class ASRExecutor(BaseExecutor):
         tag = model_type + '-' + lang + '-' + sample_rate_str
         self.task_resource.set_task_model(tag, version=None)
         self.res_path = self.task_resource.res_dir
         self.cfg_path = os.path.join(
             self.res_path, self.task_resource.res_dict['cfg_path'])
         self.ckpt_path = os.path.join(
...
@@ -158,15 +159,18 @@ class ASRExecutor(BaseExecutor):
             self.config.merge_from_file(self.cfg_path)
             with UpdateConfig(self.config):
-                if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
-                    from paddlespeech.s2t.io.collator import SpeechCollator
-                    self.vocab = self.config.vocab_filepath
+                if self.config.spm_model_prefix:
+                    self.config.spm_model_prefix = os.path.join(
+                        self.res_path, self.config.spm_model_prefix)
+                self.text_feature = TextFeaturizer(
+                    unit_type=self.config.unit_type,
+                    vocab=self.config.vocab_filepath,
+                    spm_model_prefix=self.config.spm_model_prefix)
+                if "deepspeech2" in model_type:
                     self.config.decode.lang_model_path = os.path.join(
                         MODEL_HOME, 'language_model',
                         self.config.decode.lang_model_path)
-                    self.collate_fn_test = SpeechCollator.from_config(self.config)
-                    self.text_feature = TextFeaturizer(
-                        unit_type=self.config.unit_type, vocab=self.vocab)
                     lm_url = self.task_resource.res_dict['lm_url']
                     lm_md5 = self.task_resource.res_dict['lm_md5']
                     self.download_lm(
...
@@ -174,12 +178,6 @@ class ASRExecutor(BaseExecutor):
                         os.path.dirname(self.config.decode.lang_model_path),
                         lm_md5)
             elif "conformer" in model_type or "transformer" in model_type:
-                self.config.spm_model_prefix = os.path.join(
-                    self.res_path, self.config.spm_model_prefix)
-                self.text_feature = TextFeaturizer(
-                    unit_type=self.config.unit_type,
-                    vocab=self.config.vocab_filepath,
-                    spm_model_prefix=self.config.spm_model_prefix)
                 self.config.decode.decoding_method = decode_method
             else:
...
@@ -222,19 +220,7 @@ class ASRExecutor(BaseExecutor):
         logger.info("Preprocess audio_file:" + audio_file)

         # Get the object for feature extraction
-        if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
-            audio, _ = self.collate_fn_test.process_utterance(
-                audio_file=audio_file, transcript=" ")
-            audio_len = audio.shape[0]
-            audio = paddle.to_tensor(audio, dtype='float32')
-            audio_len = paddle.to_tensor(audio_len)
-            audio = paddle.unsqueeze(audio, axis=0)
-            # vocab_list = collate_fn_test.vocab_list
-            self._inputs["audio"] = audio
-            self._inputs["audio_len"] = audio_len
-            logger.info(f"audio feat shape: {audio.shape}")
-        elif "conformer" in model_type or "transformer" in model_type:
+        if "deepspeech2" in model_type or "conformer" in model_type or "transformer" in model_type:
             logger.info("get the preprocess conf")
             preprocess_conf = self.config.preprocess_config
             preprocess_args = {"train": False}
...
@@ -242,7 +228,6 @@ class ASRExecutor(BaseExecutor):
             logger.info("read the audio file")
             audio, audio_sample_rate = soundfile.read(
                 audio_file, dtype="int16", always_2d=True)
             if self.change_format:
                 if audio.shape[1] >= 2:
                     audio = audio.mean(axis=1, dtype=np.int16)
...
@@ -285,7 +270,7 @@ class ASRExecutor(BaseExecutor):
         cfg = self.config.decode
         audio = self._inputs["audio"]
         audio_len = self._inputs["audio_len"]
-        if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
+        if "deepspeech2" in model_type:
             decode_batch_size = audio.shape[0]
             self.model.decoder.init_decoder(
                 decode_batch_size, self.text_feature.vocab_list,
...
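With the deepspeech2online/deepspeech2offline branches unified above, the CLI executor now follows one preprocessing path for deepspeech2, conformer, and transformer models. A minimal usage sketch; the model tag matches an entry in pretrained_models.py below, but the wav filename is an assumption:

# Run the refactored ASR CLI executor from Python.
from paddlespeech.cli.asr.infer import ASRExecutor

asr = ASRExecutor()
text = asr(audio_file='zh.wav',                       # assumed input file
           model='deepspeech2online_wenetspeech',     # tag from the resource table
           lang='zh',
           sample_rate=16000)
print(text)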
paddlespeech/resource/model_alias.py
@@ -23,7 +23,7 @@ model_alias = {
     # ---------------------------------
     "deepspeech2offline": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
-    "deepspeech2online":
-    ["paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline"],
+    "deepspeech2online":
+    ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
     "conformer": ["paddlespeech.s2t.models.u2:U2Model"],
     "conformer_online": ["paddlespeech.s2t.models.u2:U2Model"],
     "transformer": ["paddlespeech.s2t.models.u2:U2Model"],
...
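Alias entries of the form "package.module:ClassName" are resolved by splitting on the colon, importing the module, and looking up the class by name. A hedged sketch of that pattern; dynamic_import here is a simplified illustration, not paddlespeech's own helper:

# Resolve "module:Class" alias strings into classes.
import importlib

def dynamic_import(alias: str):
    module_name, class_name = alias.split(':')
    module = importlib.import_module(module_name)
    return getattr(module, class_name)

# After this commit, both offline and online aliases resolve to the same
# unified class:
# model_class = dynamic_import("paddlespeech.s2t.models.ds2:DeepSpeech2Model")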
paddlespeech/resource/pretrained_models.py
@@ -136,9 +136,9 @@ asr_dynamic_pretrained_models = {
     "deepspeech2online_wenetspeech-zh-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.1.model.tar.gz',
             'md5':
-            'e393d4d274af0f6967db24fc146e8074',
+            'd1be86a3e786042ab64f05161b5fae62',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
...
@@ -152,13 +152,13 @@ asr_dynamic_pretrained_models = {
     "deepspeech2offline_aishell-zh-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz',
             'md5':
-            '932c3593d62fe5c741b59b31318aa314',
+            '4d26066c6f19f52087425dc722ae5b13',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
-            'exp/deepspeech2/checkpoints/avg_1',
+            'exp/deepspeech2/checkpoints/avg_10',
             'lm_url':
             'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
             'lm_md5':
...
@@ -168,9 +168,9 @@ asr_dynamic_pretrained_models = {
     "deepspeech2online_aishell-zh-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_1.0.1.model.tar.gz',
             'md5':
-            '98b87b171b7240b7cae6e07d8d0bc9be',
+            'df5ddeac8b679a470176649ac4b78726',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
...
@@ -188,13 +188,13 @@ asr_dynamic_pretrained_models = {
     "deepspeech2offline_librispeech-en-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_offline_librispeech_ckpt_1.0.1.model.tar.gz',
             'md5':
-            'f5666c81ad015c8de03aac2bc92e5762',
+            'ed9e2b008a65268b3484020281ab048c',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
-            'exp/deepspeech2/checkpoints/avg_1',
+            'exp/deepspeech2/checkpoints/avg_5',
             'lm_url':
             'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm',
             'lm_md5':
...
@@ -207,17 +207,17 @@ asr_static_pretrained_models = {
     "deepspeech2offline_aishell-zh-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz',
             'md5':
-            '932c3593d62fe5c741b59b31318aa314',
+            '4d26066c6f19f52087425dc722ae5b13',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
-            'exp/deepspeech2/checkpoints/avg_1',
+            'exp/deepspeech2/checkpoints/avg_10',
             'model':
-            'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel',
+            'exp/deepspeech2/checkpoints/avg_10.jit.pdmodel',
             'params':
-            'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams',
+            'exp/deepspeech2/checkpoints/avg_10.jit.pdiparams',
             'lm_url':
             'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
             'lm_md5':
...
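Entries like these pair each download 'url' with an 'md5' so the tarball can be verified before unpacking. A hedged sketch of just the verification step; paddlespeech has its own download utilities, so this helper is purely illustrative:

# Verify a downloaded model archive against its published MD5.
import hashlib

def md5_matches(path: str, expected_md5: str) -> bool:
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        # hash in 1 MiB chunks to keep memory flat on large archives
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest() == expected_md5

# md5_matches('asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz',
#             '4d26066c6f19f52087425dc722ae5b13')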
paddlespeech/s2t/exps/deepspeech2/bin/export.py
@@ -35,8 +35,6 @@ if __name__ == "__main__":
     # save jit model to
     parser.add_argument(
         "--export_path", type=str, help="path of the jit model to save")
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help="offline/online")
     parser.add_argument(
         '--nxpu',
         type=int,
...
@@ -44,7 +42,6 @@ if __name__ == "__main__":
         choices=[0, 1],
         help="if nxpu == 0 and ngpu == 0, use cpu.")
     args = parser.parse_args()
-    print("model_type:{}".format(args.model_type))
     print_arguments(args)

     # https://yaml.org/type/float.html
...
paddlespeech/s2t/exps/deepspeech2/bin/test.py
@@ -32,8 +32,6 @@ def main(config, args):
 if __name__ == "__main__":
     parser = default_argument_parser()
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
     # save asr result to
     parser.add_argument(
         "--result_file", type=str, help="path of save the asr result")
...
@@ -45,7 +43,6 @@ if __name__ == "__main__":
         help="if nxpu == 0 and ngpu == 0, use cpu.")
     args = parser.parse_args()
     print_arguments(args, globals())
-    print("model_type:{}".format(args.model_type))

     # https://yaml.org/type/float.html
     config = CfgNode(new_allowed=True)
...
paddlespeech/s2t/exps/deepspeech2/bin/test_export.py
@@ -38,8 +38,6 @@ if __name__ == "__main__":
     #load jit model from
     parser.add_argument(
         "--export_path", type=str, help="path of the jit model to save")
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
     parser.add_argument(
         '--nxpu',
         type=int,
...
@@ -50,7 +48,6 @@ if __name__ == "__main__":
         "--enable-auto-log", action="store_true", help="use auto log")
     args = parser.parse_args()
     print_arguments(args, globals())
-    print("model_type:{}".format(args.model_type))

     # https://yaml.org/type/float.html
     config = CfgNode(new_allowed=True)
...
paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py
@@ -23,7 +23,6 @@ from yacs.config import CfgNode
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.io.collator import SpeechCollator
 from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
 from paddlespeech.s2t.training.cli import default_argument_parser
 from paddlespeech.s2t.utils import mp_tools
 from paddlespeech.s2t.utils.checkpoint import Checkpoint
...
@@ -113,12 +112,7 @@ class DeepSpeech2Tester_hub():
         config.input_dim = self.collate_fn_test.feature_size
         config.output_dim = self.collate_fn_test.vocab_size
-        if self.args.model_type == 'offline':
-            model = DeepSpeech2Model.from_config(config)
-        elif self.args.model_type == 'online':
-            model = DeepSpeech2ModelOnline.from_config(config)
-        else:
-            raise Exception("wrong model type")
+        model = DeepSpeech2Model.from_config(config)
         self.model = model
...
@@ -172,8 +166,6 @@ def main(config, args):
 if __name__ == "__main__":
     parser = default_argument_parser()
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
     parser.add_argument("--audio_file", type=str, help='audio file path')
     # save asr result to
     parser.add_argument(
...
@@ -184,7 +176,6 @@ if __name__ == "__main__":
         print("Please input the audio file path")
         sys.exit(-1)
     check(args.audio_file)
-    print("model_type:{}".format(args.model_type))

     # https://yaml.org/type/float.html
     config = CfgNode(new_allowed=True)
...
paddlespeech/s2t/exps/deepspeech2/bin/train.py
@@ -31,8 +31,6 @@ def main(config, args):
 if __name__ == "__main__":
     parser = default_argument_parser()
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
     parser.add_argument(
         '--nxpu',
         type=int,
...
@@ -40,7 +38,6 @@ if __name__ == "__main__":
         choices=[0, 1],
         help="if nxpu == 0 and ngpu == 0, use cpu.")
     args = parser.parse_args()
-    print("model_type:{}".format(args.model_type))
     print_arguments(args, globals())

     # https://yaml.org/type/float.html
...
paddlespeech/s2t/exps/deepspeech2/model.py
@@ -23,16 +23,12 @@ import paddle
 from paddle import distributed as dist
 from paddle import inference
 from paddle.io import DataLoader
+from paddlespeech.s2t.io.dataloader import BatchDataLoader
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.io.collator import SpeechCollator
 from paddlespeech.s2t.io.dataset import ManifestDataset
-from paddlespeech.s2t.io.sampler import SortagradBatchSampler
-from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
 from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
 from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
 from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
 from paddlespeech.s2t.training.reporter import report
 from paddlespeech.s2t.training.timer import Timer
...
@@ -136,18 +132,13 @@ class DeepSpeech2Trainer(Trainer):
         config = self.config.clone()
         with UpdateConfig(config):
             if self.train:
-                config.input_dim = self.train_loader.collate_fn.feature_size
-                config.output_dim = self.train_loader.collate_fn.vocab_size
+                config.input_dim = self.train_loader.feat_dim
+                config.output_dim = self.train_loader.vocab_size
             else:
-                config.input_dim = self.test_loader.collate_fn.feature_size
-                config.output_dim = self.test_loader.collate_fn.vocab_size
+                config.input_dim = self.test_loader.feat_dim
+                config.output_dim = self.test_loader.vocab_size

-        if self.args.model_type == 'offline':
-            model = DeepSpeech2Model.from_config(config)
-        elif self.args.model_type == 'online':
-            model = DeepSpeech2ModelOnline.from_config(config)
-        else:
-            raise Exception("wrong model type")
+        model = DeepSpeech2Model.from_config(config)

         if self.parallel:
             model = paddle.DataParallel(model)
...
@@ -175,76 +166,81 @@ class DeepSpeech2Trainer(Trainer):
         config = self.config.clone()
         config.defrost()
         if self.train:
-            # train
-            config.manifest = config.train_manifest
-            train_dataset = ManifestDataset.from_config(config)
-            if self.parallel:
-                batch_sampler = SortagradDistributedBatchSampler(
-                    train_dataset,
-                    batch_size=config.batch_size,
-                    num_replicas=None,
-                    rank=None,
-                    shuffle=True,
-                    drop_last=True,
-                    sortagrad=config.sortagrad,
-                    shuffle_method=config.shuffle_method)
-            else:
-                batch_sampler = SortagradBatchSampler(
-                    train_dataset,
-                    shuffle=True,
-                    batch_size=config.batch_size,
-                    drop_last=True,
-                    sortagrad=config.sortagrad,
-                    shuffle_method=config.shuffle_method)
-            config.keep_transcription_text = False
-            collate_fn_train = SpeechCollator.from_config(config)
-            self.train_loader = DataLoader(
-                train_dataset,
-                batch_sampler=batch_sampler,
-                collate_fn=collate_fn_train,
-                num_workers=config.num_workers)
-            # dev
-            config.manifest = config.dev_manifest
-            dev_dataset = ManifestDataset.from_config(config)
-            config.augmentation_config = ""
-            config.keep_transcription_text = False
-            collate_fn_dev = SpeechCollator.from_config(config)
-            self.valid_loader = DataLoader(
-                dev_dataset,
-                batch_size=int(config.batch_size),
-                shuffle=False,
-                drop_last=False,
-                collate_fn=collate_fn_dev,
-                num_workers=config.num_workers)
+            # train/valid dataset, return token ids
+            self.train_loader = BatchDataLoader(
+                json_file=config.train_manifest,
+                train_mode=True,
+                sortagrad=config.sortagrad,
+                batch_size=config.batch_size,
+                maxlen_in=config.maxlen_in,
+                maxlen_out=config.maxlen_out,
+                minibatches=config.minibatches,
+                mini_batch_size=self.args.ngpu,
+                batch_count=config.batch_count,
+                batch_bins=config.batch_bins,
+                batch_frames_in=config.batch_frames_in,
+                batch_frames_out=config.batch_frames_out,
+                batch_frames_inout=config.batch_frames_inout,
+                preprocess_conf=config.preprocess_config,
+                n_iter_processes=config.num_workers,
+                subsampling_factor=1,
+                num_encs=1,
+                dist_sampler=config.get('dist_sampler', False),
+                shortest_first=False)
+
+            self.valid_loader = BatchDataLoader(
+                json_file=config.dev_manifest,
+                train_mode=False,
+                sortagrad=False,
+                batch_size=config.batch_size,
+                maxlen_in=float('inf'),
+                maxlen_out=float('inf'),
+                minibatches=0,
+                mini_batch_size=self.args.ngpu,
+                batch_count='auto',
+                batch_bins=0,
+                batch_frames_in=0,
+                batch_frames_out=0,
+                batch_frames_inout=0,
+                preprocess_conf=config.preprocess_config,
+                n_iter_processes=config.num_workers,
+                subsampling_factor=1,
+                num_encs=1,
+                dist_sampler=config.get('dist_sampler', False),
+                shortest_first=False)
             logger.info("Setup train/valid Dataloader!")
         else:
-            # test
-            config.manifest = config.test_manifest
-            test_dataset = ManifestDataset.from_config(config)
-            config.augmentation_config = ""
-            config.keep_transcription_text = True
-            collate_fn_test = SpeechCollator.from_config(config)
             decode_batch_size = config.get('decode', dict()).get(
                 'decode_batch_size', 1)
-            self.test_loader = DataLoader(
-                test_dataset,
-                batch_size=decode_batch_size,
-                shuffle=False,
-                drop_last=False,
-                collate_fn=collate_fn_test,
-                num_workers=config.num_workers)
-            logger.info("Setup test Dataloader!")
+            # test dataset, return raw text
+            self.test_loader = BatchDataLoader(
+                json_file=config.test_manifest,
+                train_mode=False,
+                sortagrad=False,
+                batch_size=decode_batch_size,
+                maxlen_in=float('inf'),
+                maxlen_out=float('inf'),
+                minibatches=0,
+                mini_batch_size=1,
+                batch_count='auto',
+                batch_bins=0,
+                batch_frames_in=0,
+                batch_frames_out=0,
+                batch_frames_inout=0,
+                preprocess_conf=config.preprocess_config,
+                n_iter_processes=1,
+                subsampling_factor=1,
+                num_encs=1)
+            logger.info("Setup test/align Dataloader!")

 class DeepSpeech2Tester(DeepSpeech2Trainer):
     def __init__(self, config, args):
         super().__init__(config, args)
         self._text_featurizer = TextFeaturizer(
-            unit_type=config.unit_type, vocab=None)
+            unit_type=config.unit_type, vocab=config.vocab_filepath)
+        self.vocab_list = self._text_featurizer.vocab_list

     def ordid2token(self, texts, texts_len):
         """ ord() id to chr() chr """
...
@@ -252,7 +248,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         for text, n in zip(texts, texts_len):
             n = n.numpy().item()
             ids = text[:n]
-            trans.append(''.join([chr(i) for i in ids]))
+            #trans.append(''.join([chr(i) for i in ids]))
+            trans.append(self._text_featurizer.defeaturize(ids.numpy().tolist()))
         return trans

     def compute_metrics(self,
...
@@ -307,8 +304,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         # Initialized the decoder in model
         decode_cfg = self.config.decode
-        vocab_list = self.test_loader.collate_fn.vocab_list
-        decode_batch_size = self.test_loader.batch_size
+        vocab_list = self.vocab_list
+        decode_batch_size = decode_cfg.decode_batch_size
         self.model.decoder.init_decoder(
             decode_batch_size, vocab_list, decode_cfg.decoding_method,
             decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta,
...
@@ -338,17 +335,9 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
     @paddle.no_grad()
     def export(self):
-        if self.args.model_type == 'offline':
-            infer_model = DeepSpeech2InferModel.from_pretrained(
-                self.test_loader, self.config, self.args.checkpoint_path)
-        elif self.args.model_type == 'online':
-            infer_model = DeepSpeech2InferModelOnline.from_pretrained(
-                self.test_loader, self.config, self.args.checkpoint_path)
-        else:
-            raise Exception("wrong model type")
+        infer_model = DeepSpeech2InferModel.from_pretrained(
+            self.test_loader, self.config, self.args.checkpoint_path)
         infer_model.eval()
-        feat_dim = self.test_loader.collate_fn.feature_size
         static_model = infer_model.export()
         logger.info(f"Export code: {static_model.forward.code}")
         paddle.jit.save(static_model, self.args.export_path)
...
@@ -376,10 +365,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
         # Initialized the decoder in model
         decode_cfg = self.config.decode
-        vocab_list = self.test_loader.collate_fn.vocab_list
-        if self.args.model_type == "online":
+        vocab_list = self.vocab_list
+        if self.config.rnn_direction == "forward":
             decode_batch_size = 1
-        elif self.args.model_type == "offline":
+        elif self.config.rnn_direction == "bidirect":
             decode_batch_size = self.test_loader.batch_size
         else:
             raise Exception("wrong model type")
...
@@ -412,11 +401,11 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
         self.model.decoder.del_decoder()

     def compute_result_transcripts(self, audio, audio_len):
-        if self.args.model_type == "online":
+        if self.config.rnn_direction == "forward":
             output_probs, output_lens, trans_batch = self.static_forward_online(
                 audio, audio_len, decoder_chunk_size=1)
             result_transcripts = [trans[-1] for trans in trans_batch]
-        elif self.args.model_type == "offline":
+        elif self.config.rnn_direction == "bidirect":
             output_probs, output_lens = self.static_forward_offline(audio,
                                                                     audio_len)
         batch_size = output_probs.shape[0]
...
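The refactor above collapses the ManifestDataset + SpeechCollator + Sortagrad sampler pipeline into a single BatchDataLoader. A minimal sketch of the new test-loader construction, mirroring the arguments visible in the diff; the manifest and preprocess paths are assumptions, and in the real trainer these come from the merged config:

# Construct the unified test dataloader as the refactored trainer does.
from paddlespeech.s2t.io.dataloader import BatchDataLoader

test_loader = BatchDataLoader(
    json_file='data/manifest.test',        # assumed manifest location
    train_mode=False,                      # return raw text, no augmentation
    sortagrad=False,
    batch_size=1,
    maxlen_in=float('inf'),                # no length-based batch reduction
    maxlen_out=float('inf'),
    minibatches=0,
    mini_batch_size=1,
    batch_count='auto',
    batch_bins=0,
    batch_frames_in=0,
    batch_frames_out=0,
    batch_frames_inout=0,
    preprocess_conf='conf/preprocess.yaml',  # assumed preprocess config
    n_iter_processes=1,
    subsampling_factor=1,
    num_encs=1)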
paddlespeech/s2t/models/ds2/conv.py
@@ -11,161 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from paddle import nn
-from paddle.nn import functional as F
-
-from paddlespeech.s2t.modules.activation import brelu
-from paddlespeech.s2t.modules.mask import make_non_pad_mask
-from paddlespeech.s2t.utils.log import Log
-
-logger = Log(__name__).getlog()
-
-__all__ = ['ConvStack', "conv_output_size"]
-
-
-def conv_output_size(I, F, P, S):
-    # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
-    # Output size after Conv:
-    #   By noting I the length of the input volume size,
-    #   F the length of the filter,
-    #   P the amount of zero padding,
-    #   S the stride,
-    #   then the output size O of the feature map along that dimension is given by:
-    #       O = (I - F + Pstart + Pend) // S + 1
-    #   When Pstart == Pend == P, we can replace Pstart + Pend by 2P.
-    #   When Pstart == Pend == 0
-    #       O = (I - F - S) // S
-    # https://iq.opengenus.org/output-size-of-convolution/
-    # Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1
-    # Output width = (Output width + padding width right + padding width left - kernel width) / (stride width) + 1
-    return (I - F + 2 * P - S) // S
-
-
-# receptive field calculator
-# https://fomoro.com/research/article/receptive-field-calculator
-# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
-# https://distill.pub/2019/computing-receptive-fields/
-# Rl-1 = Sl * Rl + (Kl - Sl)
-
-
-class ConvBn(nn.Layer):
-    """Convolution layer with batch normalization.
-
-    :param kernel_size: The x dimension of a filter kernel. Or input a tuple for
-                        two image dimension.
-    :type kernel_size: int|tuple|list
-    :param num_channels_in: Number of input channels.
-    :type num_channels_in: int
-    :param num_channels_out: Number of output channels.
-    :type num_channels_out: int
-    :param stride: The x dimension of the stride. Or input a tuple for two
-                   image dimension.
-    :type stride: int|tuple|list
-    :param padding: The x dimension of the padding. Or input a tuple for two
-                    image dimension.
-    :type padding: int|tuple|list
-    :param act: Activation type, relu|brelu
-    :type act: string
-    :return: Batch norm layer after convolution layer.
-    :rtype: Variable
-    """
-
-    def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,
-                 padding, act):
-        super().__init__()
-        assert len(kernel_size) == 2
-        assert len(stride) == 2
-        assert len(padding) == 2
-        self.kernel_size = kernel_size
-        self.stride = stride
-        self.padding = padding
-
-        self.conv = nn.Conv2D(
-            num_channels_in,
-            num_channels_out,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=padding,
-            weight_attr=None,
-            bias_attr=False,
-            data_format='NCHW')
-
-        self.bn = nn.BatchNorm2D(
-            num_channels_out,
-            weight_attr=None,
-            bias_attr=None,
-            data_format='NCHW')
-        self.act = F.relu if act == 'relu' else brelu
-
-    def forward(self, x, x_len):
-        """
-        x(Tensor): audio, shape [B, C, D, T]
-        """
-        x = self.conv(x)
-        x = self.bn(x)
-        x = self.act(x)
-
-        x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]
-                 ) // self.stride[1] + 1
-
-        # reset padding part to 0
-        masks = make_non_pad_mask(x_len)  #[B, T]
-        masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
-        # TODO(Hui Zhang): not support bool multiply
-        # masks = masks.type_as(x)
-        masks = masks.astype(x.dtype)
-        x = x.multiply(masks)
-
-        return x, x_len
-
-
-class ConvStack(nn.Layer):
-    """Convolution group with stacked convolution layers.
-
-    :param feat_size: audio feature dim.
-    :type feat_size: int
-    :param num_stacks: Number of stacked convolution layers.
-    :type num_stacks: int
-    """
-
-    def __init__(self, feat_size, num_stacks):
-        super().__init__()
-        self.feat_size = feat_size  # D
-        self.num_stacks = num_stacks
-
-        self.conv_in = ConvBn(
-            num_channels_in=1,
-            num_channels_out=32,
-            kernel_size=(41, 11),  #[D, T]
-            stride=(2, 3),
-            padding=(20, 5),
-            act='brelu')
-
-        out_channel = 32
-        convs = [
-            ConvBn(
-                num_channels_in=32,
-                num_channels_out=out_channel,
-                kernel_size=(21, 11),
-                stride=(2, 1),
-                padding=(10, 5),
-                act='brelu') for i in range(num_stacks - 1)
-        ]
-        self.conv_stack = nn.LayerList(convs)
-
-        # conv output feat_dim
-        output_height = (feat_size - 1) // 2 + 1
-        for i in range(self.num_stacks - 1):
-            output_height = (output_height - 1) // 2 + 1
-        self.output_height = out_channel * output_height
-
-    def forward(self, x, x_len):
-        """
-        x: shape [B, C, D, T]
-        x_len : shape [B]
-        """
-        x, x_len = self.conv_in(x, x_len)
-        for i, conv in enumerate(self.conv_stack):
-            x, x_len = conv(x, x_len)
-        return x, x_len
+import paddle
+
+from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4
+
+
+class Conv2dSubsampling4Pure(Conv2dSubsampling4):
+    def __init__(self, idim: int, odim: int, dropout_rate: float):
+        super().__init__(idim, odim, dropout_rate, None)
+        self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim
+        self.receptive_field_length = 2 * (
+            3 - 1) + 3  # stride_1 * (kernel_size_2 - 1) + kerel_size_1
+
+    def forward(self, x: paddle.Tensor,
+                x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:
+        x = x.unsqueeze(1)  # (b, c=1, t, f)
+        x = self.conv(x)
+        #b, c, t, f = paddle.shape(x) #not work under jit
+        x = x.transpose([0, 2, 1, 3]).reshape([0, 0, -1])
+        x_len = ((x_len - 1) // 2 - 1) // 2
+        return x, x_len
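The new Conv2dSubsampling4Pure keeps the two 3x3, stride-2 convolutions of Conv2dSubsampling4 but drops the positional encoding, so time and frequency are each reduced roughly 4x. A quick check of the bookkeeping in the class above, for the 161-dim features these recipes use:

# Verify the subsampling arithmetic from Conv2dSubsampling4Pure.
idim, odim = 161, 32

# frequency bins after two stride-2, kernel-3, unpadded convolutions
freq = ((idim - 1) // 2 - 1) // 2            # 39
output_dim = freq * odim                     # 1248 flattened channels*freq
receptive_field = 2 * (3 - 1) + 3            # 7 input frames per output frame

def subsampled_len(x_len: int) -> int:
    # same formula forward() applies to x_len
    return ((x_len - 1) // 2 - 1) // 2

print(output_dim, receptive_field, subsampled_len(200))  # 1248 7 49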
paddlespeech/s2t/models/ds2/deepspeech2.py
浏览文件 @
47dd61e5
...
@@ -13,15 +13,14 @@
...
@@ -13,15 +13,14 @@
 # limitations under the License.
 """Deepspeech2 ASR Model"""
 import paddle
+import paddle.nn.functional as F
 from paddle import nn

-from paddlespeech.s2t.models.ds2.conv import ConvStack
-from paddlespeech.s2t.models.ds2.rnn import RNNStack
+from paddlespeech.s2t.models.ds2.conv import Conv2dSubsampling4Pure
 from paddlespeech.s2t.modules.ctc import CTCDecoder
 from paddlespeech.s2t.utils import layer_tools
 from paddlespeech.s2t.utils.checkpoint import Checkpoint
 from paddlespeech.s2t.utils.log import Log

 logger = Log(__name__).getlog()

 __all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']
@@ -32,72 +31,197 @@ class CRNNEncoder(nn.Layer):
...
@@ -32,72 +31,197 @@ class CRNNEncoder(nn.Layer):
feat_size
,
feat_size
,
dict_size
,
dict_size
,
num_conv_layers
=
2
,
num_conv_layers
=
2
,
num_rnn_layers
=
3
,
num_rnn_layers
=
4
,
rnn_size
=
1024
,
rnn_size
=
1024
,
use_gru
=
False
,
rnn_direction
=
'forward'
,
share_rnn_weights
=
True
):
num_fc_layers
=
2
,
fc_layers_size_list
=
[
512
,
256
],
use_gru
=
False
):
super
().
__init__
()
super
().
__init__
()
self
.
rnn_size
=
rnn_size
self
.
rnn_size
=
rnn_size
self
.
feat_size
=
feat_size
# 161 for linear
self
.
feat_size
=
feat_size
# 161 for linear
self
.
dict_size
=
dict_size
self
.
dict_size
=
dict_size
self
.
num_rnn_layers
=
num_rnn_layers
self
.
conv
=
ConvStack
(
feat_size
,
num_conv_layers
)
self
.
num_fc_layers
=
num_fc_layers
self
.
rnn_direction
=
rnn_direction
i_size
=
self
.
conv
.
output_height
# H after conv stack
self
.
fc_layers_size_list
=
fc_layers_size_list
self
.
rnn
=
RNNStack
(
self
.
use_gru
=
use_gru
i_size
=
i_size
,
self
.
conv
=
Conv2dSubsampling4Pure
(
feat_size
,
32
,
dropout_rate
=
0.0
)
h_size
=
rnn_size
,
num_stacks
=
num_rnn_layers
,
self
.
output_dim
=
self
.
conv
.
output_dim
use_gru
=
use_gru
,
share_rnn_weights
=
share_rnn_weights
)
i_size
=
self
.
conv
.
output_dim
self
.
rnn
=
nn
.
LayerList
()
self
.
layernorm_list
=
nn
.
LayerList
()
self
.
fc_layers_list
=
nn
.
LayerList
()
if
rnn_direction
==
'bidirect'
or
rnn_direction
==
'bidirectional'
:
layernorm_size
=
2
*
rnn_size
elif
rnn_direction
==
'forward'
:
layernorm_size
=
rnn_size
else
:
raise
Exception
(
"Wrong rnn direction"
)
for
i
in
range
(
0
,
num_rnn_layers
):
if
i
==
0
:
rnn_input_size
=
i_size
else
:
rnn_input_size
=
layernorm_size
if
use_gru
is
True
:
self
.
rnn
.
append
(
nn
.
GRU
(
input_size
=
rnn_input_size
,
hidden_size
=
rnn_size
,
num_layers
=
1
,
direction
=
rnn_direction
))
else
:
self
.
rnn
.
append
(
nn
.
LSTM
(
input_size
=
rnn_input_size
,
hidden_size
=
rnn_size
,
num_layers
=
1
,
direction
=
rnn_direction
))
self
.
layernorm_list
.
append
(
nn
.
LayerNorm
(
layernorm_size
))
self
.
output_dim
=
layernorm_size
fc_input_size
=
layernorm_size
for
i
in
range
(
self
.
num_fc_layers
):
self
.
fc_layers_list
.
append
(
nn
.
Linear
(
fc_input_size
,
fc_layers_size_list
[
i
]))
fc_input_size
=
fc_layers_size_list
[
i
]
self
.
output_dim
=
fc_layers_size_list
[
i
]
@
property
@
property
def
output_size
(
self
):
def
output_size
(
self
):
return
self
.
rnn_size
*
2
return
self
.
output_dim
def
forward
(
self
,
audio
,
audio_len
):
def
forward
(
self
,
x
,
x_lens
,
init_state_h_box
=
None
,
init_state_c_box
=
None
):
"""Compute Encoder outputs
"""Compute Encoder outputs
Args:
Args:
audio (Tensor): [B, Tmax
, D]
x (Tensor): [B, T
, D]
text (Tensor): [B, Umax
]
x_lens (Tensor): [B
]
audio_len (Tensor): [B
]
init_state_h_box(Tensor): init_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size
]
text_len (Tensor): [B
]
init_state_c_box(Tensor): init_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size
]
Return
s
:
Return:
x (Tensor): encoder outputs, [B, T, D]
x (Tensor): encoder outputs, [B, T, D]
x_lens (Tensor): encoder length, [B]
x_lens (Tensor): encoder length, [B]
final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
"""
"""
# [B, T, D] -> [B, D, T]
if
init_state_h_box
is
not
None
:
audio
=
audio
.
transpose
([
0
,
2
,
1
])
init_state_list
=
None
# [B, D, T] -> [B, C=1, D, T]
x
=
audio
.
unsqueeze
(
1
)
if
self
.
use_gru
is
True
:
x_lens
=
audio_len
init_state_h_list
=
paddle
.
split
(
init_state_h_box
,
self
.
num_rnn_layers
,
axis
=
0
)
init_state_list
=
init_state_h_list
else
:
init_state_h_list
=
paddle
.
split
(
init_state_h_box
,
self
.
num_rnn_layers
,
axis
=
0
)
init_state_c_list
=
paddle
.
split
(
init_state_c_box
,
self
.
num_rnn_layers
,
axis
=
0
)
init_state_list
=
[(
init_state_h_list
[
i
],
init_state_c_list
[
i
])
for
i
in
range
(
self
.
num_rnn_layers
)]
else
:
init_state_list
=
[
None
]
*
self
.
num_rnn_layers
# convolution group
x
,
x_lens
=
self
.
conv
(
x
,
x_lens
)
x
,
x_lens
=
self
.
conv
(
x
,
x_lens
)
final_chunk_state_list
=
[]
for
i
in
range
(
0
,
self
.
num_rnn_layers
):
x
,
final_state
=
self
.
rnn
[
i
](
x
,
init_state_list
[
i
],
x_lens
)
#[B, T, D]
final_chunk_state_list
.
append
(
final_state
)
x
=
self
.
layernorm_list
[
i
](
x
)
for
i
in
range
(
self
.
num_fc_layers
):
x
=
self
.
fc_layers_list
[
i
](
x
)
x
=
F
.
relu
(
x
)
if
self
.
use_gru
is
True
:
final_chunk_state_h_box
=
paddle
.
concat
(
final_chunk_state_list
,
axis
=
0
)
final_chunk_state_c_box
=
init_state_c_box
else
:
final_chunk_state_h_list
=
[
final_chunk_state_list
[
i
][
0
]
for
i
in
range
(
self
.
num_rnn_layers
)
]
final_chunk_state_c_list
=
[
final_chunk_state_list
[
i
][
1
]
for
i
in
range
(
self
.
num_rnn_layers
)
]
final_chunk_state_h_box
=
paddle
.
concat
(
final_chunk_state_h_list
,
axis
=
0
)
final_chunk_state_c_box
=
paddle
.
concat
(
final_chunk_state_c_list
,
axis
=
0
)
return
x
,
x_lens
,
final_chunk_state_h_box
,
final_chunk_state_c_box
def
forward_chunk_by_chunk
(
self
,
x
,
x_lens
,
decoder_chunk_size
=
8
):
"""Compute Encoder outputs
# convert data from convolution feature map to sequence of vectors
Args:
#B, C, D, T = paddle.shape(x) # not work under jit
x (Tensor): [B, T, D]
x
=
x
.
transpose
([
0
,
3
,
1
,
2
])
#[B, T, C, D]
x_lens (Tensor): [B]
#x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit
decoder_chunk_size: The chunk size of decoder
x
=
x
.
reshape
([
0
,
0
,
-
1
])
#[B, T, C*D]
Returns:
eouts_list (List of Tensor): The list of encoder outputs in chunk_size: [B, chunk_size, D] * num_chunks
# remove padding part
eouts_lens_list (List of Tensor): The list of encoder length in chunk_size: [B] * num_chunks
x
,
x_lens
=
self
.
rnn
(
x
,
x_lens
)
#[B, T, D]
final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
return
x
,
x_lens
final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
"""
subsampling_rate
=
self
.
conv
.
subsampling_rate
receptive_field_length
=
self
.
conv
.
receptive_field_length
chunk_size
=
(
decoder_chunk_size
-
1
)
*
subsampling_rate
+
receptive_field_length
chunk_stride
=
subsampling_rate
*
decoder_chunk_size
max_len
=
x
.
shape
[
1
]
assert
(
chunk_size
<=
max_len
)
eouts_chunk_list
=
[]
eouts_chunk_lens_list
=
[]
if
(
max_len
-
chunk_size
)
%
chunk_stride
!=
0
:
padding_len
=
chunk_stride
-
(
max_len
-
chunk_size
)
%
chunk_stride
else
:
padding_len
=
0
padding
=
paddle
.
zeros
((
x
.
shape
[
0
],
padding_len
,
x
.
shape
[
2
]))
padded_x
=
paddle
.
concat
([
x
,
padding
],
axis
=
1
)
num_chunk
=
(
max_len
+
padding_len
-
chunk_size
)
/
chunk_stride
+
1
num_chunk
=
int
(
num_chunk
)
chunk_state_h_box
=
None
chunk_state_c_box
=
None
final_state_h_box
=
None
final_state_c_box
=
None
for
i
in
range
(
0
,
num_chunk
):
start
=
i
*
chunk_stride
end
=
start
+
chunk_size
x_chunk
=
padded_x
[:,
start
:
end
,
:]
x_len_left
=
paddle
.
where
(
x_lens
-
i
*
chunk_stride
<
0
,
paddle
.
zeros_like
(
x_lens
),
x_lens
-
i
*
chunk_stride
)
x_chunk_len_tmp
=
paddle
.
ones_like
(
x_lens
)
*
chunk_size
x_chunk_lens
=
paddle
.
where
(
x_len_left
<
x_chunk_len_tmp
,
x_len_left
,
x_chunk_len_tmp
)
eouts_chunk
,
eouts_chunk_lens
,
chunk_state_h_box
,
chunk_state_c_box
=
self
.
forward
(
x_chunk
,
x_chunk_lens
,
chunk_state_h_box
,
chunk_state_c_box
)
eouts_chunk_list
.
append
(
eouts_chunk
)
eouts_chunk_lens_list
.
append
(
eouts_chunk_lens
)
final_state_h_box
=
chunk_state_h_box
final_state_c_box
=
chunk_state_c_box
return
eouts_chunk_list
,
eouts_chunk_lens_list
,
final_state_h_box
,
final_state_c_box
 class DeepSpeech2Model(nn.Layer):
     """The DeepSpeech2 network structure.

-    :param audio_data: Audio spectrogram data layer.
-    :type audio_data: Variable
-    :param text_data: Transcription text data layer.
-    :type text_data: Variable
+    :param audio: Audio spectrogram data layer.
+    :type audio: Variable
+    :param text: Transcription text data layer.
+    :type text: Variable
     :param audio_len: Valid sequence length data layer.
     :type audio_len: Variable
-    :param masks: Masks data layer to reset padding.
-    :type masks: Variable
+    :param feat_size: feature size for audio.
+    :type feat_size: int
     :param dict_size: Dictionary size for tokenized transcription.
     :type dict_size: int
     :param num_conv_layers: Number of stacking convolution layers.
     :type num_conv_layers: int

@@ -106,37 +230,41 @@ class DeepSpeech2Model(nn.Layer):
     :type num_rnn_layers: int
     :param rnn_size: RNN layer size (dimension of RNN cells).
     :type rnn_size: int
+    :param num_fc_layers: Number of stacking FC layers.
+    :type num_fc_layers: int
+    :param fc_layers_size_list: The list of FC layer sizes.
+    :type fc_layers_size_list: [int,]
     :param use_gru: Use gru if set True. Use simple rnn if set False.
     :type use_gru: bool
-    :param share_rnn_weights: Whether to share input-hidden weights between
-                              forward and backward direction RNNs.
-                              It is only available when use_gru=False.
-    :type share_weights: bool
     :return: A tuple of an output unnormalized log probability layer (
              before softmax) and a ctc cost layer.
     :rtype: tuple of LayerOutput
     """

     def __init__(self,
                  feat_size,
                  dict_size,
                  num_conv_layers=2,
-                 num_rnn_layers=3,
+                 num_rnn_layers=4,
                  rnn_size=1024,
+                 rnn_direction='forward',
+                 num_fc_layers=2,
+                 fc_layers_size_list=[512, 256],
                  use_gru=False,
-                 share_rnn_weights=True,
                  blank_id=0,
-                 ctc_grad_norm_type=None):
+                 ctc_grad_norm_type=None, ):
         super().__init__()
         self.encoder = CRNNEncoder(
             feat_size=feat_size,
             dict_size=dict_size,
             num_conv_layers=num_conv_layers,
             num_rnn_layers=num_rnn_layers,
+            rnn_direction=rnn_direction,
+            num_fc_layers=num_fc_layers,
+            fc_layers_size_list=fc_layers_size_list,
             rnn_size=rnn_size,
-            use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights)
-        assert (self.encoder.output_size == rnn_size * 2)
+            use_gru=use_gru)

         self.decoder = CTCDecoder(
             odim=dict_size,  # <blank> is in vocab
@@ -151,7 +279,7 @@ class DeepSpeech2Model(nn.Layer):
         """Compute Model loss

         Args:
-            audio (Tensors): [B, T, D]
+            audio (Tensor): [B, T, D]
             audio_len (Tensor): [B]
             text (Tensor): [B, U]
             text_len (Tensor): [B]

@@ -159,22 +287,22 @@ class DeepSpeech2Model(nn.Layer):
         Returns:
             loss (Tensor): [1]
         """
-        eouts, eouts_len = self.encoder(audio, audio_len)
+        eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
+            audio, audio_len, None, None)
         loss = self.decoder(eouts, eouts_len, text, text_len)
         return loss

     @paddle.no_grad()
     def decode(self, audio, audio_len):
         # decoders only accept string encoded in utf-8
         # Make sure the decoder has been initialized
-        eouts, eouts_len = self.encoder(audio, audio_len)
+        eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
+            audio, audio_len, None, None)
         probs = self.decoder.softmax(eouts)
         batch_size = probs.shape[0]
         self.decoder.reset_decoder(batch_size=batch_size)
         self.decoder.next(probs, eouts_len)
         trans_best, trans_beam = self.decoder.decode()
         return trans_best
     @classmethod

@@ -196,13 +324,15 @@ class DeepSpeech2Model(nn.Layer):
             The model built from pretrained result.
         """
         model = cls(
-            feat_size=dataloader.collate_fn.feature_size,
-            dict_size=dataloader.collate_fn.vocab_size,
+            feat_size=dataloader.feat_dim,
+            dict_size=dataloader.vocab_size,
             num_conv_layers=config.num_conv_layers,
             num_rnn_layers=config.num_rnn_layers,
             rnn_size=config.rnn_layer_size,
+            rnn_direction=config.rnn_direction,
+            num_fc_layers=config.num_fc_layers,
+            fc_layers_size_list=config.fc_layers_size_list,
             use_gru=config.use_gru,
-            share_rnn_weights=config.share_rnn_weights,
             blank_id=config.blank_id,
             ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
         infos = Checkpoint().load_parameters(
@@ -229,8 +359,10 @@ class DeepSpeech2Model(nn.Layer):
             num_conv_layers=config.num_conv_layers,
             num_rnn_layers=config.num_rnn_layers,
             rnn_size=config.rnn_layer_size,
+            rnn_direction=config.rnn_direction,
+            num_fc_layers=config.num_fc_layers,
+            fc_layers_size_list=config.fc_layers_size_list,
             use_gru=config.use_gru,
-            share_rnn_weights=config.share_rnn_weights,
             blank_id=config.blank_id,
             ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
         return model
@@ -240,21 +372,37 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-    def forward(self, audio, audio_len):
-        """export model function
-
-        Args:
-            audio (Tensor): [B, T, D]
-            audio_len (Tensor): [B]
-
-        Returns:
-            probs: probs after softmax
-        """
-        eouts, eouts_len = self.encoder(audio, audio_len)
-        probs = self.decoder.softmax(eouts)
-        return probs, eouts_len
+    def forward(self,
+                audio_chunk,
+                audio_chunk_lens,
+                chunk_state_h_box=None,
+                chunk_state_c_box=None):
+        if self.encoder.rnn_direction == "forward":
+            eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder(
+                audio_chunk, audio_chunk_lens, chunk_state_h_box,
+                chunk_state_c_box)
+            probs_chunk = self.decoder.softmax(eouts_chunk)
+            return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box
+        elif self.encoder.rnn_direction == "bidirect":
+            eouts, eouts_len, _, _ = self.encoder(audio_chunk,
+                                                  audio_chunk_lens)
+            probs = self.decoder.softmax(eouts)
+            return probs, eouts_len
+        else:
+            raise Exception("wrong model type")

     def export(self):
-        static_model = paddle.jit.to_static(
-            self,
-            input_spec=[
+        if self.encoder.rnn_direction == "forward":
+            static_model = paddle.jit.to_static(
+                self,
+                input_spec=[
+                    paddle.static.InputSpec(
+                        shape=[None, None,
+                               self.encoder.feat_size],  #[B, chunk_size, feat_dim]
+                        dtype='float32'),
+                    paddle.static.InputSpec(shape=[None],
+                                            dtype='int64'),  # audio_length, [B]
+                    paddle.static.InputSpec(
+                        shape=[None, None, None], dtype='float32'),
+                    paddle.static.InputSpec(
+                        shape=[None, None, None], dtype='float32')
+                ])
+        elif self.encoder.rnn_direction == "bidirect":
+            static_model = paddle.jit.to_static(
+                self,
+                input_spec=[

@@ -264,4 +412,6 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
                     paddle.static.InputSpec(shape=[None],
                                             dtype='int64'),  # audio_length, [B]
                 ])
+        else:
+            raise Exception("wrong model type")
         return static_model
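To make the chunk bookkeeping in `forward_chunk_by_chunk` concrete: with `Conv2dSubsampling4Pure` the encoder consumes 4 input frames per output step (`subsampling_rate = 4`) and each output step sees 7 input frames (`receptive_field_length = 2 * (3 - 1) + 3 = 7`). A small sanity-check sketch, pure arithmetic with those values plugged in:

    decoder_chunk_size = 8
    subsampling_rate = 4
    receptive_field_length = 7

    # input frames needed to produce decoder_chunk_size encoder outputs
    chunk_size = (decoder_chunk_size - 1) * subsampling_rate + receptive_field_length
    # input frames to advance between consecutive chunks
    chunk_stride = subsampling_rate * decoder_chunk_size

    assert chunk_size == 35 and chunk_stride == 32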
paddlespeech/s2t/models/ds2/rnn.py
deleted 100644 → 0, Browse file @ 0fa32e4a

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

from paddlespeech.s2t.modules.activation import brelu
from paddlespeech.s2t.modules.mask import make_non_pad_mask
from paddlespeech.s2t.utils.log import Log

logger = Log(__name__).getlog()

__all__ = ['RNNStack']


class RNNCell(nn.RNNCellBase):
    r"""
    Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
    computes the outputs and updates states.
    The formula used is as follows:
    .. math::
        h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
        y_{t} & = h_{t}
    where :math:`act` is for :attr:`activation`.
    """

    def __init__(self,
                 hidden_size: int,
                 activation="tanh",
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super().__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_hh = self.create_parameter(
            (hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = None
        self.bias_hh = self.create_parameter(
            (hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))

        self.hidden_size = hidden_size
        if activation not in ["tanh", "relu", "brelu"]:
            raise ValueError(
                "activation for SimpleRNNCell should be tanh or relu, "
                "but get {}".format(activation))
        self.activation = activation
        self._activation_fn = paddle.tanh \
            if activation == "tanh" \
            else F.relu
        if activation == 'brelu':
            self._activation_fn = brelu

    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)
        pre_h = states
        i2h = inputs
        if self.bias_ih is not None:
            i2h += self.bias_ih
        h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            h2h += self.bias_hh
        h = self._activation_fn(i2h + h2h)
        return h, h

    @property
    def state_shape(self):
        return (self.hidden_size, )


class GRUCell(nn.RNNCellBase):
    r"""
    Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
    it computes the outputs and updates states.
    The formula for GRU used is as follows:
    ..  math::
        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
        h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
        y_{t} & = h_{t}
    where :math:`\sigma` is the sigmoid function, and * is the elementwise
    multiplication operator.
    """

    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super().__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_hh = self.create_parameter(
            (3 * hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = None
        self.bias_hh = self.create_parameter(
            (3 * hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))

        self.hidden_size = hidden_size
        self.input_size = input_size
        self._gate_activation = F.sigmoid
        self._activation = paddle.tanh

    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)

        pre_hidden = states
        x_gates = inputs
        if self.bias_ih is not None:
            x_gates = x_gates + self.bias_ih
        h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            h_gates = h_gates + self.bias_hh

        x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1)
        h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1)

        r = self._gate_activation(x_r + h_r)
        z = self._gate_activation(x_z + h_z)
        c = self._activation(x_c + r * h_c)  # apply reset gate after mm
        h = (pre_hidden - c) * z + c
        # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru

        return h, h

    @property
    def state_shape(self):
        r"""
        The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
        size would be automatically inserted into shape). The shape corresponds
        to the shape of :math:`h_{t-1}`.
        """
        return (self.hidden_size, )


class BiRNNWithBN(nn.Layer):
    """Bidirectional simple rnn layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param size: Dimension of RNN cells.
    :type size: int
    :param share_weights: Whether to share input-hidden weights between
                          forward and backward directional RNNs.
    :type share_weights: bool
    :return: Bidirectional simple rnn layer.
    :rtype: Variable
    """

    def __init__(self, i_size: int, h_size: int, share_weights: bool):
        super().__init__()
        self.share_weights = share_weights
        if self.share_weights:
            #input-hidden weights shared between bi-directional rnn.
            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            # batch norm is only performed on input-state projection
            self.fw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')
            self.bw_fc = self.fw_fc
            self.bw_bn = self.fw_bn
        else:
            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            self.fw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')
            self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            self.bw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')

        self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')
        self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')
        self.fw_rnn = nn.RNN(
            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
        self.bw_rnn = nn.RNN(
            self.fw_cell, is_reverse=True, time_major=False)  #[B, T, D]

    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
        # x, shape [B, T, D]
        fw_x = self.fw_bn(self.fw_fc(x))
        bw_x = self.bw_bn(self.bw_fc(x))
        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
        x = paddle.concat([fw_x, bw_x], axis=-1)
        return x, x_len


class BiGRUWithBN(nn.Layer):
    """Bidirectional gru layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer.
    :type name: string
    :param input: Input layer.
    :type input: Variable
    :param size: Dimension of GRU cells.
    :type size: int
    :param act: Activation type.
    :type act: string
    :return: Bidirectional GRU layer.
    :rtype: Variable
    """

    def __init__(self, i_size: int, h_size: int):
        super().__init__()
        hidden_size = h_size * 3

        self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
        self.fw_bn = nn.BatchNorm1D(
            hidden_size, bias_attr=None, data_format='NLC')
        self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
        self.bw_bn = nn.BatchNorm1D(
            hidden_size, bias_attr=None, data_format='NLC')

        self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
        self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
        self.fw_rnn = nn.RNN(
            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
        self.bw_rnn = nn.RNN(
            self.fw_cell, is_reverse=True, time_major=False)  #[B, T, D]

    def forward(self, x, x_len):
        # x, shape [B, T, D]
        fw_x = self.fw_bn(self.fw_fc(x))
        bw_x = self.bw_bn(self.bw_fc(x))
        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
        x = paddle.concat([fw_x, bw_x], axis=-1)
        return x, x_len


class RNNStack(nn.Layer):
    """RNN group with stacked bidirectional simple RNN or GRU layers.

    :param input: Input layer.
    :type input: Variable
    :param size: Dimension of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward directional RNNs.
                              It is only available when use_gru=False.
    :type share_weights: bool
    :return: Output layer of the RNN group.
    :rtype: Variable
    """

    def __init__(self,
                 i_size: int,
                 h_size: int,
                 num_stacks: int,
                 use_gru: bool,
                 share_rnn_weights: bool):
        super().__init__()
        rnn_stacks = []
        for i in range(num_stacks):
            if use_gru:
                #default:GRU using tanh
                rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size))
            else:
                rnn_stacks.append(
                    BiRNNWithBN(
                        i_size=i_size,
                        h_size=h_size,
                        share_weights=share_rnn_weights))
            i_size = h_size * 2

        self.rnn_stacks = nn.LayerList(rnn_stacks)

    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
        """
        x: shape [B, T, D]
        x_len: shape [B]
        """
        for i, rnn in enumerate(self.rnn_stacks):
            x, x_len = rnn(x, x_len)
            masks = make_non_pad_mask(x_len)  #[B, T]
            masks = masks.unsqueeze(-1)  # [B, T, 1]
            # TODO(Hui Zhang): not support bool multiply
            masks = masks.astype(x.dtype)
            x = x.multiply(masks)
        return x, x_len
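One step in `GRUCell.forward` above is easy to misread: the update `h = (pre_hidden - c) * z + c` is an algebraic rearrangement of the conventional GRU update `h = z * h_{t-1} + (1 - z) * c`, matching the paddle dynamic_gru convention linked in the comment. A one-line check:

    # z*pre + (1 - z)*c  ==  (pre - c)*z + c   (expand and collect terms)
    z, pre, c = 0.3, 0.9, -0.2
    assert abs((z * pre + (1 - z) * c) - ((pre - c) * z + c)) < 1e-12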
paddlespeech/s2t/models/ds2_online/__init__.py
deleted 100644 → 0, Browse file @ 0fa32e4a

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .deepspeech2 import DeepSpeech2InferModelOnline
from .deepspeech2 import DeepSpeech2ModelOnline
from paddlespeech.s2t.utils import dynamic_pip_install
import sys

try:
    import paddlespeech_ctcdecoders
except ImportError:
    try:
        package_name = 'paddlespeech_ctcdecoders'
        if sys.platform != "win32":
            dynamic_pip_install.install(package_name)
    except Exception:
        raise RuntimeError(
            "Can not install package paddlespeech_ctcdecoders on your system. \
                The DeepSpeech2 model is not supported for your system")

__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']
paddlespeech/s2t/models/ds2_online/conv.py
deleted 100644 → 0, Browse file @ 0fa32e4a

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle

from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4


class Conv2dSubsampling4Online(Conv2dSubsampling4):
    def __init__(self, idim: int, odim: int, dropout_rate: float):
        super().__init__(idim, odim, dropout_rate, None)
        self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim
        self.receptive_field_length = 2 * (
            3 - 1) + 3  # stride_1 * (kernel_size_2 - 1) + kernel_size_1

    def forward(self, x: paddle.Tensor,
                x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:
        x = x.unsqueeze(1)  # (b, c=1, t, f)
        x = self.conv(x)
        #b, c, t, f = paddle.shape(x) #not work under jit
        x = x.transpose([0, 2, 1, 3]).reshape([0, 0, -1])
        x_len = ((x_len - 1) // 2 - 1) // 2
        return x, x_len
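The two hard-coded expressions in `Conv2dSubsampling4Online.__init__` follow directly from the two stride-2, kernel-3 convolutions inside `Conv2dSubsampling4`. A quick check with the usual 161-dim linear spectrogram input (the `feat_size  # 161 for linear` comment elsewhere in this commit):

    idim, odim = 161, 32
    freq_after = ((idim - 1) // 2 - 1) // 2   # frequency bins after two stride-2 convs
    assert freq_after == 39
    assert freq_after * odim == 1248          # flattened output_dim per frame
    assert 2 * (3 - 1) + 3 == 7               # receptive field: stride_1*(kernel_2-1)+kernel_1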
paddlespeech/s2t/models/ds2_online/deepspeech2.py
deleted 100644 → 0, Browse file @ 0fa32e4a

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Deepspeech2 ASR Online Model"""
import paddle
import paddle.nn.functional as F
from paddle import nn

from paddlespeech.s2t.models.ds2_online.conv import Conv2dSubsampling4Online
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.utils import layer_tools
from paddlespeech.s2t.utils.checkpoint import Checkpoint
from paddlespeech.s2t.utils.log import Log

logger = Log(__name__).getlog()

__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']


class CRNNEncoder(nn.Layer):
    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=4,
                 rnn_size=1024,
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
                 use_gru=False):
        super().__init__()
        self.rnn_size = rnn_size
        self.feat_size = feat_size  # 161 for linear
        self.dict_size = dict_size
        self.num_rnn_layers = num_rnn_layers
        self.num_fc_layers = num_fc_layers
        self.rnn_direction = rnn_direction
        self.fc_layers_size_list = fc_layers_size_list
        self.use_gru = use_gru
        self.conv = Conv2dSubsampling4Online(feat_size, 32, dropout_rate=0.0)
        self.output_dim = self.conv.output_dim

        i_size = self.conv.output_dim
        self.rnn = nn.LayerList()
        self.layernorm_list = nn.LayerList()
        self.fc_layers_list = nn.LayerList()
        if rnn_direction == 'bidirect' or rnn_direction == 'bidirectional':
            layernorm_size = 2 * rnn_size
        elif rnn_direction == 'forward':
            layernorm_size = rnn_size
        else:
            raise Exception("Wrong rnn direction")
        for i in range(0, num_rnn_layers):
            if i == 0:
                rnn_input_size = i_size
            else:
                rnn_input_size = layernorm_size
            if use_gru is True:
                self.rnn.append(
                    nn.GRU(
                        input_size=rnn_input_size,
                        hidden_size=rnn_size,
                        num_layers=1,
                        direction=rnn_direction))
            else:
                self.rnn.append(
                    nn.LSTM(
                        input_size=rnn_input_size,
                        hidden_size=rnn_size,
                        num_layers=1,
                        direction=rnn_direction))
            self.layernorm_list.append(nn.LayerNorm(layernorm_size))
            self.output_dim = layernorm_size

        fc_input_size = layernorm_size
        for i in range(self.num_fc_layers):
            self.fc_layers_list.append(
                nn.Linear(fc_input_size, fc_layers_size_list[i]))
            fc_input_size = fc_layers_size_list[i]
            self.output_dim = fc_layers_size_list[i]

    @property
    def output_size(self):
        return self.output_dim

    def forward(self, x, x_lens, init_state_h_box=None, init_state_c_box=None):
        """Compute Encoder outputs

        Args:
            x (Tensor): [B, T, D]
            x_lens (Tensor): [B]
            init_state_h_box(Tensor): init_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            init_state_c_box(Tensor): init_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
        Return:
            x (Tensor): encoder outputs, [B, T, D]
            x_lens (Tensor): encoder length, [B]
            final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
        """
        if init_state_h_box is not None:
            init_state_list = None

            if self.use_gru is True:
                init_state_h_list = paddle.split(
                    init_state_h_box, self.num_rnn_layers, axis=0)
                init_state_list = init_state_h_list
            else:
                init_state_h_list = paddle.split(
                    init_state_h_box, self.num_rnn_layers, axis=0)
                init_state_c_list = paddle.split(
                    init_state_c_box, self.num_rnn_layers, axis=0)
                init_state_list = [(init_state_h_list[i], init_state_c_list[i])
                                   for i in range(self.num_rnn_layers)]
        else:
            init_state_list = [None] * self.num_rnn_layers

        x, x_lens = self.conv(x, x_lens)
        final_chunk_state_list = []
        for i in range(0, self.num_rnn_layers):
            x, final_state = self.rnn[i](x, init_state_list[i],
                                         x_lens)  #[B, T, D]
            final_chunk_state_list.append(final_state)
            x = self.layernorm_list[i](x)

        for i in range(self.num_fc_layers):
            x = self.fc_layers_list[i](x)
            x = F.relu(x)

        if self.use_gru is True:
            final_chunk_state_h_box = paddle.concat(
                final_chunk_state_list, axis=0)
            final_chunk_state_c_box = init_state_c_box
        else:
            final_chunk_state_h_list = [
                final_chunk_state_list[i][0] for i in range(self.num_rnn_layers)
            ]
            final_chunk_state_c_list = [
                final_chunk_state_list[i][1] for i in range(self.num_rnn_layers)
            ]
            final_chunk_state_h_box = paddle.concat(
                final_chunk_state_h_list, axis=0)
            final_chunk_state_c_box = paddle.concat(
                final_chunk_state_c_list, axis=0)

        return x, x_lens, final_chunk_state_h_box, final_chunk_state_c_box

    def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8):
        """Compute Encoder outputs

        Args:
            x (Tensor): [B, T, D]
            x_lens (Tensor): [B]
            decoder_chunk_size: The chunk size of decoder
        Returns:
            eouts_list (List of Tensor): The list of encoder outputs in chunk_size: [B, chunk_size, D] * num_chunks
            eouts_lens_list (List of Tensor): The list of encoder length in chunk_size: [B] * num_chunks
            final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
        """
        subsampling_rate = self.conv.subsampling_rate
        receptive_field_length = self.conv.receptive_field_length
        chunk_size = (decoder_chunk_size - 1
                      ) * subsampling_rate + receptive_field_length
        chunk_stride = subsampling_rate * decoder_chunk_size
        max_len = x.shape[1]
        assert (chunk_size <= max_len)

        eouts_chunk_list = []
        eouts_chunk_lens_list = []
        if (max_len - chunk_size) % chunk_stride != 0:
            padding_len = chunk_stride - (max_len - chunk_size) % chunk_stride
        else:
            padding_len = 0
        padding = paddle.zeros((x.shape[0], padding_len, x.shape[2]))
        padded_x = paddle.concat([x, padding], axis=1)
        num_chunk = (max_len + padding_len - chunk_size) / chunk_stride + 1
        num_chunk = int(num_chunk)
        chunk_state_h_box = None
        chunk_state_c_box = None
        final_state_h_box = None
        final_state_c_box = None
        for i in range(0, num_chunk):
            start = i * chunk_stride
            end = start + chunk_size
            x_chunk = padded_x[:, start:end, :]

            x_len_left = paddle.where(x_lens - i * chunk_stride < 0,
                                      paddle.zeros_like(x_lens),
                                      x_lens - i * chunk_stride)
            x_chunk_len_tmp = paddle.ones_like(x_lens) * chunk_size
            x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp,
                                        x_len_left, x_chunk_len_tmp)

            eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward(
                x_chunk, x_chunk_lens, chunk_state_h_box, chunk_state_c_box)

            eouts_chunk_list.append(eouts_chunk)
            eouts_chunk_lens_list.append(eouts_chunk_lens)
        final_state_h_box = chunk_state_h_box
        final_state_c_box = chunk_state_c_box
        return eouts_chunk_list, eouts_chunk_lens_list, final_state_h_box, final_state_c_box


class DeepSpeech2ModelOnline(nn.Layer):
    """The DeepSpeech2 network structure for online.

    :param audio: Audio spectrogram data layer.
    :type audio: Variable
    :param text: Transcription text data layer.
    :type text: Variable
    :param audio_len: Valid sequence length data layer.
    :type audio_len: Variable
    :param feat_size: feature size for audio.
    :type feat_size: int
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (dimension of RNN cells).
    :type rnn_size: int
    :param num_fc_layers: Number of stacking FC layers.
    :type num_fc_layers: int
    :param fc_layers_size_list: The list of FC layer sizes.
    :type fc_layers_size_list: [int,]
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :return: A tuple of an output unnormalized log probability layer (
             before softmax) and a ctc cost layer.
    :rtype: tuple of LayerOutput
    """

    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=4,
                 rnn_size=1024,
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
                 use_gru=False,
                 blank_id=0,
                 ctc_grad_norm_type=None, ):
        super().__init__()
        self.encoder = CRNNEncoder(
            feat_size=feat_size,
            dict_size=dict_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_direction=rnn_direction,
            num_fc_layers=num_fc_layers,
            fc_layers_size_list=fc_layers_size_list,
            rnn_size=rnn_size,
            use_gru=use_gru)

        self.decoder = CTCDecoder(
            odim=dict_size,  # <blank> is in vocab
            enc_n_units=self.encoder.output_size,
            blank_id=blank_id,
            dropout_rate=0.0,
            reduction=True,  # sum
            batch_average=True,  # sum / batch_size
            grad_norm_type=ctc_grad_norm_type)

    def forward(self, audio, audio_len, text, text_len):
        """Compute Model loss

        Args:
            audio (Tensor): [B, T, D]
            audio_len (Tensor): [B]
            text (Tensor): [B, U]
            text_len (Tensor): [B]

        Returns:
            loss (Tensor): [1]
        """
        eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
            audio, audio_len, None, None)
        loss = self.decoder(eouts, eouts_len, text, text_len)
        return loss

    @paddle.no_grad()
    def decode(self, audio, audio_len):
        # decoders only accept string encoded in utf-8
        # Make sure the decoder has been initialized
        eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
            audio, audio_len, None, None)
        probs = self.decoder.softmax(eouts)
        batch_size = probs.shape[0]
        self.decoder.reset_decoder(batch_size=batch_size)
        self.decoder.next(probs, eouts_len)
        trans_best, trans_beam = self.decoder.decode()
        return trans_best

    @classmethod
    def from_pretrained(cls, dataloader, config, checkpoint_path):
        """Build a DeepSpeech2ModelOnline from a pretrained model.

        Parameters
        ----------
        dataloader: paddle.io.DataLoader
        config: yacs.config.CfgNode
            model configs
        checkpoint_path: Path or str
            the path of pretrained model checkpoint, without extension name

        Returns
        -------
        DeepSpeech2ModelOnline
            The model built from pretrained result.
        """
        model = cls(
            feat_size=dataloader.collate_fn.feature_size,
            dict_size=dataloader.collate_fn.vocab_size,
            num_conv_layers=config.num_conv_layers,
            num_rnn_layers=config.num_rnn_layers,
            rnn_size=config.rnn_layer_size,
            rnn_direction=config.rnn_direction,
            num_fc_layers=config.num_fc_layers,
            fc_layers_size_list=config.fc_layers_size_list,
            use_gru=config.use_gru,
            blank_id=config.blank_id,
            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
        infos = Checkpoint().load_parameters(
            model, checkpoint_path=checkpoint_path)
        logger.info(f"checkpoint info: {infos}")
        layer_tools.summary(model)
        return model

    @classmethod
    def from_config(cls, config):
        """Build a DeepSpeech2ModelOnline from config

        Parameters
        ----------
        config: yacs.config.CfgNode
            config

        Returns
        -------
        DeepSpeech2ModelOnline
            The model built from config.
        """
        model = cls(
            feat_size=config.input_dim,
            dict_size=config.output_dim,
            num_conv_layers=config.num_conv_layers,
            num_rnn_layers=config.num_rnn_layers,
            rnn_size=config.rnn_layer_size,
            rnn_direction=config.rnn_direction,
            num_fc_layers=config.num_fc_layers,
            fc_layers_size_list=config.fc_layers_size_list,
            use_gru=config.use_gru,
            blank_id=config.blank_id,
            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
        return model


class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
                chunk_state_c_box):
        eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder(
            audio_chunk, audio_chunk_lens, chunk_state_h_box,
            chunk_state_c_box)
        probs_chunk = self.decoder.softmax(eouts_chunk)
        return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box

    def export(self):
        static_model = paddle.jit.to_static(
            self,
            input_spec=[
                paddle.static.InputSpec(
                    shape=[None, None,
                           self.encoder.feat_size],  #[B, chunk_size, feat_dim]
                    dtype='float32'),
                paddle.static.InputSpec(shape=[None],
                                        dtype='int64'),  # audio_length, [B]
                paddle.static.InputSpec(
                    shape=[None, None, None], dtype='float32'),
                paddle.static.InputSpec(
                    shape=[None, None, None], dtype='float32')
            ])
        return static_model
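A minimal sketch of driving the refactored streaming model chunk by chunk, threading the recurrent `h`/`c` state boxes between calls exactly as the infer-model `forward` above expects. This assumes the refactored `paddlespeech.s2t.models.ds2` package re-exports `DeepSpeech2InferModel`; the sizes and the three synthetic chunks are illustrative:

    import paddle
    # assumption: the refactored ds2 package exposes DeepSpeech2InferModel
    from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel

    model = DeepSpeech2InferModel(feat_size=161, dict_size=29)  # forward-only RNN by default
    model.eval()

    h_box, c_box = None, None  # no recurrent history before the first chunk
    for _ in range(3):  # three synthetic 35-frame chunks, batch size 1
        chunk = paddle.randn([1, 35, 161])
        chunk_lens = paddle.to_tensor([35], dtype='int64')
        probs, lens, h_box, c_box = model(chunk, chunk_lens, h_box, c_box)
        # feed `probs` into the CTC decoder incrementally here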
paddlespeech/server/engine/asr/online/asr_engine.py
Browse file @ 47dd61e5

@@ -25,7 +25,6 @@ from paddlespeech.cli.log import logger
 from paddlespeech.cli.utils import MODEL_HOME
 from paddlespeech.resource import CommonTaskResource
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.frontend.speech import SpeechSegment
 from paddlespeech.s2t.modules.ctc import CTCDecoder
 from paddlespeech.s2t.transform.transformation import Transformation
 from paddlespeech.s2t.utils.tensor_utils import add_sos_eos

@@ -66,10 +65,13 @@ class PaddleASRConnectionHanddler:
         self.text_feature = self.asr_engine.executor.text_feature

         if "deepspeech2" in self.model_type:
-            from paddlespeech.s2t.io.collator import SpeechCollator
             self.am_predictor = self.asr_engine.executor.am_predictor
-            self.collate_fn_test = SpeechCollator.from_config(self.model_config)
+
+            # extract feat, new only fbank in conformer model
+            self.preprocess_conf = self.model_config.preprocess_config
+            self.preprocess_args = {"train": False}
+            self.preprocessing = Transformation(self.preprocess_conf)

             self.decoder = CTCDecoder(
                 odim=self.model_config.output_dim,  # <blank> is in vocab
                 enc_n_units=self.model_config.rnn_layer_size * 2,

@@ -89,10 +91,8 @@ class PaddleASRConnectionHanddler:
                 cfg.num_proc_bsearch)

             # frame window and frame shift, in samples unit
-            self.win_length = int(self.model_config.window_ms / 1000 *
-                                  self.sample_rate)
-            self.n_shift = int(self.model_config.stride_ms / 1000 *
-                               self.sample_rate)
+            self.win_length = self.preprocess_conf.process[0]['win_length']
+            self.n_shift = self.preprocess_conf.process[0]['n_shift']

         elif "conformer" in self.model_type or "transformer" in self.model_type:
             # acoustic model

@@ -123,11 +123,6 @@ class PaddleASRConnectionHanddler:
         samples = np.frombuffer(samples, dtype=np.int16)
         assert samples.ndim == 1

-        # pcm16 -> pcm 32
-        # pcm2float will change the original samples,
-        # so we should do pcm2float before concatenate
-        samples = pcm2float(samples)
-
         if self.remained_wav is None:
             self.remained_wav = samples
         else:

@@ -137,26 +132,11 @@ class PaddleASRConnectionHanddler:
             f"The connection remain the audio samples: {self.remained_wav.shape}"
         )

-        # read audio
-        speech_segment = SpeechSegment.from_pcm(
-            self.remained_wav, self.sample_rate, transcript=" ")
-        # audio augment
-        self.collate_fn_test.augmentation.transform_audio(speech_segment)
-
-        # extract speech feature
-        spectrum, transcript_part = self.collate_fn_test._speech_featurizer.featurize(
-            speech_segment, self.collate_fn_test.keep_transcription_text)
-        # CMVN spectrum
-        if self.collate_fn_test._normalizer:
-            spectrum = self.collate_fn_test._normalizer.apply(spectrum)
-
-        # spectrum augment
-        feat = self.collate_fn_test.augmentation.transform_feature(spectrum)
-
-        # audio_len is frame num
-        frame_num = feat.shape[0]
-        feat = paddle.to_tensor(feat, dtype='float32')
-        feat = paddle.unsqueeze(feat, axis=0)
+        # fbank
+        feat = self.preprocessing(self.remained_wav, **self.preprocess_args)
+        feat = paddle.to_tensor(feat, dtype="float32").unsqueeze(axis=0)

         if self.cached_feat is None:
             self.cached_feat = feat

@@ -170,8 +150,11 @@ class PaddleASRConnectionHanddler:
         if self.device is None:
             self.device = self.cached_feat.place

-        self.num_frames += frame_num
-        self.remained_wav = self.remained_wav[self.n_shift * frame_num:]
+        # cur frame step
+        num_frames = feat.shape[1]
+        self.num_frames += num_frames
+        self.remained_wav = self.remained_wav[self.n_shift * num_frames:]

         logger.info(
             f"process the audio feature success, the connection feat shape: {self.cached_feat.shape}"

@@ -752,16 +735,19 @@ class ASRServerExecutor(ASRExecutor):
         self.config = CfgNode(new_allowed=True)
         self.config.merge_from_file(self.cfg_path)

+        if self.config.spm_model_prefix:
+            self.config.spm_model_prefix = os.path.join(
+                self.res_path, self.config.spm_model_prefix)
+        self.text_feature = TextFeaturizer(
+            unit_type=self.config.unit_type,
+            vocab=self.config.vocab_filepath,
+            spm_model_prefix=self.config.spm_model_prefix)
+        self.vocab = self.config.vocab_filepath
+
         with UpdateConfig(self.config):
             if "deepspeech2" in model_type:
-                from paddlespeech.s2t.io.collator import SpeechCollator
-                self.vocab = self.config.vocab_filepath
                 self.config.decode.lang_model_path = os.path.join(
                     MODEL_HOME, 'language_model',
                     self.config.decode.lang_model_path)
-                self.collate_fn_test = SpeechCollator.from_config(self.config)
-                self.text_feature = TextFeaturizer(
-                    unit_type=self.config.unit_type, vocab=self.vocab)

                 lm_url = self.task_resource.res_dict['lm_url']
                 lm_md5 = self.task_resource.res_dict['lm_md5']

@@ -772,14 +758,6 @@ class ASRServerExecutor(ASRExecutor):
             elif "conformer" in model_type or "transformer" in model_type:
                 logger.info("start to create the stream conformer asr engine")
-                if self.config.spm_model_prefix:
-                    self.config.spm_model_prefix = os.path.join(
-                        self.res_path, self.config.spm_model_prefix)
-                self.vocab = self.config.vocab_filepath
-                self.text_feature = TextFeaturizer(
-                    unit_type=self.config.unit_type,
-                    vocab=self.config.vocab_filepath,
-                    spm_model_prefix=self.config.spm_model_prefix)

                 # update the decoding method
                 if decode_method:
                     self.config.decode.decoding_method = decode_method
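The refactored handler now does its sliding-window bookkeeping in raw-sample units: each emitted fbank frame consumes `n_shift` fresh samples, so `remained_wav` can drop `n_shift * num_frames` samples once a batch of frames is extracted. A standalone sketch of that arithmetic (the 400/160-sample values for a 25 ms window and 10 ms shift at 16 kHz, and the frame-count formula, are illustrative assumptions about what the fbank frontend effectively produces):

    import numpy as np

    win_length, n_shift = 400, 160           # assumed 25 ms window / 10 ms shift at 16 kHz
    buf = np.zeros(1000, dtype=np.float32)   # pretend remained_wav holds 1000 samples

    num_frames = 1 + (len(buf) - win_length) // n_shift  # standard framing count: 4 frames
    buf = buf[n_shift * num_frames:]                     # keep the not-yet-consumed tail
    assert num_frames == 4 and len(buf) == 360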
paddlespeech/server/engine/asr/paddleinference/asr_engine.py
Browse file @ 47dd61e5

@@ -54,6 +54,7 @@ class ASRServerExecutor(ASRExecutor):
         sample_rate_str = '16k' if sample_rate == 16000 else '8k'
         tag = model_type + '-' + lang + '-' + sample_rate_str
+        self.max_len = 50
         self.task_resource.set_task_model(model_tag=tag)
         if cfg_path is None or am_model is None or am_params is None:
             self.res_path = self.task_resource.res_dir

@@ -80,22 +81,24 @@ class ASRServerExecutor(ASRExecutor):
         self.config.merge_from_file(self.cfg_path)

         with UpdateConfig(self.config):
-            if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
-                from paddlespeech.s2t.io.collator import SpeechCollator
+            if "deepspeech2" in model_type:
                 self.vocab = self.config.vocab_filepath
+                if self.config.spm_model_prefix:
+                    self.config.spm_model_prefix = os.path.join(
+                        self.res_path, self.config.spm_model_prefix)
+                self.text_feature = TextFeaturizer(
+                    unit_type=self.config.unit_type,
+                    vocab=self.vocab,
+                    spm_model_prefix=self.config.spm_model_prefix)
                 self.config.decode.lang_model_path = os.path.join(
                     MODEL_HOME, 'language_model',
                     self.config.decode.lang_model_path)
-                self.collate_fn_test = SpeechCollator.from_config(self.config)
-                self.text_feature = TextFeaturizer(
-                    unit_type=self.config.unit_type, vocab=self.vocab)

                 lm_url = self.task_resource.res_dict['lm_url']
                 lm_md5 = self.task_resource.res_dict['lm_md5']
                 self.download_lm(
                     lm_url,
                     os.path.dirname(self.config.decode.lang_model_path), lm_md5)
-            elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type:
+            elif "conformer" in model_type or "transformer" in model_type:
                 raise Exception("wrong type")
             else:
                 raise Exception("wrong type")

@@ -125,7 +128,7 @@ class ASRServerExecutor(ASRExecutor):
         cfg = self.config.decode
         audio = self._inputs["audio"]
         audio_len = self._inputs["audio_len"]
-        if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
+        if "deepspeech2" in model_type:
             decode_batch_size = audio.shape[0]
             # init once
             self.decoder.init_decoder(
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录