PaddlePaddle / DeepSpeech
Commit 919c8d06

Merge branch 'PaddlePaddle:develop' into update_engine

Author:    liangym
Committer: lym0302
Date:      Jun 05, 2022
Parents:   8b1c1ec4, eea56a4a
Showing 74 changed files with 1011 additions and 55785 deletions (+1011 / -55785):

demos/streaming_asr_server/web/templates/index.html                 +1     -0
docker/ubuntu18-cpu/Dockerfile                                      +15    -0
examples/aishell/asr0/conf/augmentation.json                        +0     -36
examples/aishell/asr0/conf/deepspeech2.yaml                         +31    -28
examples/aishell/asr0/conf/deepspeech2_online.yaml                  +19    -20
examples/aishell/asr0/conf/preprocess.yaml                          +25    -0
examples/aishell/asr0/conf/tuning/decode.yaml                       +3     -3
examples/aishell/asr0/local/data.sh                                 +4     -3
examples/aishell/asr0/local/export.sh                               +3     -5
examples/aishell/asr0/local/test.sh                                 +13    -15
examples/aishell/asr0/local/test_export.sh                          +3     -5
examples/aishell/asr0/local/test_wav.sh                             +3     -5
examples/aishell/asr0/local/train.sh                                +2     -5
examples/aishell/asr0/run.sh                                        +6     -7
examples/librispeech/asr0/conf/augmentation.json                    +0     -36
examples/librispeech/asr0/conf/deepspeech2.yaml                     +29    -29
examples/librispeech/asr0/conf/deepspeech2_online.yaml              +28    -31
examples/librispeech/asr0/conf/preprocess.yaml                      +25    -0
examples/librispeech/asr0/local/data.sh                             +4     -3
examples/librispeech/asr0/local/export.sh                           +3     -5
examples/librispeech/asr0/local/test.sh                             +40    -13
examples/librispeech/asr0/local/test_wav.sh                         +3     -5
examples/librispeech/asr0/local/train.sh                            +2     -5
examples/librispeech/asr0/run.sh                                    +11    -7
examples/tiny/asr0/conf/augmentation.json                           +0     -36
examples/tiny/asr0/conf/deepspeech2.yaml                            +21    -20
examples/tiny/asr0/conf/deepspeech2_online.yaml                     +19    -20
examples/tiny/asr0/conf/preprocess.yaml                             +25    -0
examples/tiny/asr0/local/export.sh                                  +3     -5
examples/tiny/asr0/local/test.sh                                    +3     -5
examples/tiny/asr0/local/train.sh                                   +2     -5
examples/tiny/asr0/run.sh                                           +3     -5
paddlespeech/cli/asr/infer.py                                       +12    -27
paddlespeech/resource/model_alias.py                                +1     -1
paddlespeech/resource/pretrained_models.py                          +16    -16
paddlespeech/s2t/exps/deepspeech2/bin/export.py                     +1     -4
paddlespeech/s2t/exps/deepspeech2/bin/test.py                       +1     -4
paddlespeech/s2t/exps/deepspeech2/bin/test_export.py                +0     -3
paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py                   +1     -10
paddlespeech/s2t/exps/deepspeech2/bin/train.py                      +0     -3
paddlespeech/s2t/exps/deepspeech2/model.py                          +81    -93
paddlespeech/s2t/models/ds2/conv.py                                 +14    -152
paddlespeech/s2t/models/ds2/deepspeech2.py                          +242   -92
paddlespeech/s2t/models/ds2/rnn.py                                  +0     -315
paddlespeech/s2t/models/ds2_online/__init__.py                      +0     -31
paddlespeech/s2t/models/ds2_online/conv.py                          +0     -33
paddlespeech/s2t/models/ds2_online/deepspeech2.py                   +0     -397
paddlespeech/server/engine/asr/online/asr_engine.py                 +34    -56
paddlespeech/server/engine/asr/paddleinference/asr_engine.py        +12    -9
paddlespeech/server/engine/tts/online/python/tts_engine.py          +1     -1
speechx/CMakeLists.txt                                              +0     -1
speechx/examples/custom_asr/run.sh                                  +0     -1
speechx/examples/ds2_ol/README.md                                   +2     -10
speechx/examples/ds2_ol/aishell/path.sh                             +1     -1
speechx/examples/ds2_ol/aishell/run.sh                              +0     -2
speechx/examples/ds2_ol/aishell/run_fbank.sh                        +0     -1
speechx/examples/ds2_ol/websocket/path.sh                           +1     -1
speechx/examples/ds2_ol/websocket/websocket_client.sh               +1     -1
speechx/examples/ds2_ol/websocket/websocket_server.sh               +0     -15
speechx/speechx/decoder/param.h                                     +2     -4
speechx/speechx/decoder/recognizer_main.cc                          +2     -1
speechx/speechx/frontend/audio/audio_cache.h                        +3     -2
speechx/speechx/frontend/audio/compute_fbank_main.cc                +5     -6
speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc   +0     -1
speechx/speechx/frontend/audio/fbank.cc                             +20    -83
speechx/speechx/frontend/audio/fbank.h                              +15    -42
speechx/speechx/frontend/audio/feature_common.h                     +54    -0
speechx/speechx/frontend/audio/feature_common_inl.h                 +95    -0
speechx/speechx/frontend/audio/feature_pipeline.h                   +1     -1
speechx/speechx/frontend/audio/linear_spectrogram.cc                +21    -70
speechx/speechx/frontend/audio/linear_spectrogram.h                 +22    -34
speechx/speechx/utils/CMakeLists.txt                                +1     -2
speechx/speechx/utils/simdjson.cpp                                  +0     -16016
speechx/speechx/utils/simdjson.h                                    +0     -37881
demos/streaming_asr_server/web/templates/index.html
@@ -93,6 +93,7 @@
         function parseResult(data) {
             var data = JSON.parse(data)
+            console.log('result json:', data)
             var result = data.result
             console.log(result)
             $("#resultPanel").html(result)
docker/ubuntu18-cpu/Dockerfile  (new file, mode 100644, +15 -0)

FROM registry.baidubce.com/paddlepaddle/paddle:2.2.2
LABEL maintainer="paddlesl@baidu.com"

RUN git clone --depth 1 https://github.com/PaddlePaddle/PaddleSpeech.git /home/PaddleSpeech

RUN pip3 uninstall mccabe -y ; exit 0;
RUN pip3 install multiprocess==0.70.12 importlib-metadata==4.2.0 dill==0.3.4

RUN cd /home/PaddleSpeech/audio
RUN python setup.py bdist_wheel
RUN cd /home/PaddleSpeech
RUN python setup.py bdist_wheel
RUN pip install audio/dist/*.whl dist/*.whl

WORKDIR /home/PaddleSpeech/
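
A minimal sketch of how this new image might be built and entered, assuming Docker is installed and the command runs from the repository root; the image tag is illustrative, not part of the commit:

    # build the CPU image from the new Dockerfile
    docker build -t paddlespeech:ubuntu18-cpu docker/ubuntu18-cpu
    # open an interactive shell; WORKDIR leaves us in /home/PaddleSpeech/
    docker run -it paddlespeech:ubuntu18-cpu /bin/bash

Note that each RUN executes in a fresh layer, so the bare "RUN cd ..." lines do not change the working directory for the RUN commands that follow them.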
examples/aishell/asr0/conf/augmentation.json  (deleted, mode 100644 -> 0, +0 -36)

[
    {
        "type": "speed",
        "params": {
            "min_speed_rate": 0.9,
            "max_speed_rate": 1.1,
            "num_rates": 3
        },
        "prob": 0.0
    },
    {
        "type": "shift",
        "params": {
            "min_shift_ms": -5,
            "max_shift_ms": 5
        },
        "prob": 1.0
    },
    {
        "type": "specaug",
        "params": {
            "W": 0,
            "warp_mode": "PIL",
            "F": 10,
            "n_freq_masks": 2,
            "T": 50,
            "n_time_masks": 2,
            "p": 1.0,
            "adaptive_number_ratio": 0,
            "adaptive_size_ratio": 0,
            "max_n_time_masks": 20,
            "replace_with_zero": true
        },
        "prob": 1.0
    }
]
examples/aishell/asr0/conf/deepspeech2.yaml
@@ -15,50 +15,53 @@ max_output_input_ratio: .inf
 ###########################################
 # Dataloader                              #
 ###########################################
-batch_size: 64  # one gpu
-mean_std_filepath: data/mean_std.json
-unit_type: char
 vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
 feat_dim: 161
-delta_delta: False
 stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 2
+window_ms: 25.0
+sortagrad: 0  # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512  # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0  # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1

 ############################################
 # Network Architecture                     #
 ############################################
 num_conv_layers: 2
-num_rnn_layers: 3
+num_rnn_layers: 5
 rnn_layer_size: 1024
-use_gru: True
-share_rnn_weights: False
+rnn_direction: bidirect  # [forward, bidirect]
+num_fc_layers: 0
+fc_layers_size_list: -1,
+use_gru: False
 blank_id: 0
-ctc_grad_norm_type: instance

 ###########################################
 # Training                                #
 ###########################################
-n_epoch: 80
+n_epoch: 50
 accum_grad: 1
-lr: 2.0e-3
-lr_decay: 0.83
+lr: 5.0e-4
+lr_decay: 0.93
 weight_decay: 1.0e-6
 global_grad_clip: 3.0
-log_interval: 100
+dist_sampler: False
+log_interval: 1
 checkpoint:
   kbest_n: 50
   latest_n: 5
examples/aishell/asr0/conf/deepspeech2_online.yaml
@@ -15,28 +15,26 @@ max_output_input_ratio: .inf
 ###########################################
 # Dataloader                              #
 ###########################################
-batch_size: 64  # one gpu
-mean_std_filepath: data/mean_std.json
-unit_type: char
 vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear  #linear, mfcc, fbank
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
 feat_dim: 161
-delta_delta: False
 stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 0
+window_ms: 25.0
+sortagrad: 0  # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512  # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0  # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1

 ############################################
 # Network Architecture                     #

@@ -54,12 +52,13 @@ blank_id: 0
 ###########################################
 # Training                                #
 ###########################################
-n_epoch: 65
+n_epoch: 30
 accum_grad: 1
 lr: 5.0e-4
 lr_decay: 0.93
 weight_decay: 1.0e-6
 global_grad_clip: 3.0
+dist_sampler: False
 log_interval: 100
 checkpoint:
   kbest_n: 50
examples/aishell/asr0/conf/preprocess.yaml  (new file, mode 100644, +25 -0)

process:
    # extract kaldi fbank from PCM
    - type: fbank_kaldi
      fs: 16000
      n_mels: 161
      n_shift: 160
      win_length: 400
      dither: 0.1
    - type: cmvn_json
      cmvn_path: data/mean_std.json
    # these three processes are a.k.a. SpecAugument
    - type: time_warp
      max_time_warp: 5
      inplace: true
      mode: PIL
    - type: freq_mask
      F: 30
      n_mask: 2
      inplace: true
      replace_with_zero: false
    - type: time_mask
      T: 40
      n_mask: 2
      inplace: true
      replace_with_zero: false
examples/aishell/asr0/conf/tuning/decode.yaml
@@ -2,9 +2,9 @@ decode_batch_size: 128
 error_rate_type: cer
 decoding_method: ctc_beam_search
 lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
-alpha: 1.9
-beta: 5.0
-beam_size: 300
+alpha: 2.2
+beta: 4.3
+beam_size: 500
 cutoff_prob: 0.99
 cutoff_top_n: 40
 num_proc_bsearch: 10
examples/aishell/asr0/local/data.sh
@@ -33,12 +33,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --spectrum_type="linear" \
+    --spectrum_type="fbank" \
+    --feat_dim=161 \
     --delta_delta=false \
     --stride_ms=10 \
-    --window_ms=20 \
+    --window_ms=25 \
     --sample_rate=16000 \
-    --use_dB_normalization=True \
+    --use_dB_normalization=False \
     --num_samples=2000 \
     --num_workers=${num_workers} \
     --output_path="data/mean_std.json"
examples/aishell/asr0/local/export.sh
 #!/bin/bash

-if [ $# != 4 ];then
-    echo "usage: $0 config_path ckpt_prefix jit_model_path model_type"
+if [ $# != 3 ];then
+    echo "usage: $0 config_path ckpt_prefix jit_model_path"
     exit -1
 fi

@@ -11,14 +11,12 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_path_prefix=$2
 jit_model_export_path=$3
-model_type=$4

 python3 -u ${BIN_DIR}/export.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --checkpoint_path ${ckpt_path_prefix} \
---export_path ${jit_model_export_path} \
---model_type ${model_type}
+--export_path ${jit_model_export_path}

 if [ $? -ne 0 ]; then
     echo "Failed in export!"
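
With model_type gone, the export call takes exactly the three positional arguments named in the usage string; a hedged invocation sketch (the checkpoint prefix and output path are illustrative):

    CUDA_VISIBLE_DEVICES=0 ./local/export.sh conf/deepspeech2.yaml \
        exp/deepspeech2/checkpoints/avg_10 exp/deepspeech2/checkpoints/avg_10.jit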
examples/aishell/asr0/local/test.sh
 #!/bin/bash

-if [ $# != 4 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
     exit -1
 fi

@@ -13,7 +13,6 @@ echo "using $ngpu gpus..."
 config_path=$1
 decode_config_path=$2
 ckpt_prefix=$3
-model_type=$4

 # download language model
 bash local/download_lm_ch.sh

@@ -23,7 +22,7 @@ fi
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # format the reference test file
-    python utils/format_rsl.py \
+    python3 utils/format_rsl.py \
         --origin_ref data/manifest.test.raw \
         --trans_ref data/manifest.test.text

@@ -32,8 +31,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     --config ${config_path} \
     --decode_cfg ${decode_config_path} \
     --result_file ${ckpt_prefix}.rsl \
-    --checkpoint_path ${ckpt_prefix} \
-    --model_type ${model_type}
+    --checkpoint_path ${ckpt_prefix}

     if [ $? -ne 0 ]; then
         echo "Failed in evaluation!"

@@ -41,25 +39,25 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     fi

     # format the hyp file
-    python utils/format_rsl.py \
+    python3 utils/format_rsl.py \
         --origin_hyp ${ckpt_prefix}.rsl \
         --trans_hyp ${ckpt_prefix}.rsl.text

-    python utils/compute-wer.py --char=1 --v=1 \
+    python3 utils/compute-wer.py --char=1 --v=1 \
         data/manifest.test.text ${ckpt_prefix}.rsl.text > ${ckpt_prefix}.error
 fi

 if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
-    python utils/format_rsl.py \
+    python3 utils/format_rsl.py \
         --origin_ref data/manifest.test.raw \
         --trans_ref_sclite data/manifest.test.text.sclite

-    python utils/format_rsl.py \
+    python3 utils/format_rsl.py \
         --origin_hyp ${ckpt_prefix}.rsl \
         --trans_hyp_sclite ${ckpt_prefix}.rsl.text.sclite

     mkdir -p ${ckpt_prefix}_sclite
     sclite -i wsj -r data/manifest.test.text.sclite -h ${ckpt_prefix}.rsl.text.sclite -e utf-8 -o all -O ${ckpt_prefix}_sclite -c NOASCII
 fi

 exit 0
examples/aishell/asr0/local/test_export.sh
 #!/bin/bash

-if [ $# != 4 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
     exit -1
 fi

@@ -11,7 +11,6 @@ echo "using $ngpu gpus..."
 config_path=$1
 decode_config_path=$2
 jit_model_export_path=$3
-model_type=$4

 # download language model
 bash local/download_lm_ch.sh > /dev/null 2>&1

@@ -24,8 +23,7 @@ python3 -u ${BIN_DIR}/test_export.py \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result_file ${jit_model_export_path}.rsl \
---export_path ${jit_model_export_path} \
---model_type ${model_type}
+--export_path ${jit_model_export_path}

 if [ $? -ne 0 ]; then
     echo "Failed in evaluation!"
examples/aishell/asr0/local/test_wav.sh
 #!/bin/bash

-if [ $# != 5 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file"
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
     exit -1
 fi

@@ -11,8 +11,7 @@ echo "using $ngpu gpus..."
 config_path=$1
 decode_config_path=$2
 ckpt_prefix=$3
-model_type=$4
-audio_file=$5
+audio_file=$4

 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/

@@ -37,7 +36,6 @@ python3 -u ${BIN_DIR}/test_wav.py \
 --decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
---model_type ${model_type} \
 --audio_file ${audio_file}

 if [ $? -ne 0 ]; then
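
The single-wav test likewise drops to four positional arguments, matching its usage string; a hedged sketch (the checkpoint path is illustrative, the demo wav is the one the script downloads):

    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh conf/deepspeech2.yaml conf/tuning/decode.yaml \
        exp/deepspeech2/checkpoints/avg_10 data/demo_01_03.wav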
examples/aishell/asr0/local/train.sh
 #!/bin/bash

-if [ $# != 3 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type"
+if [ $# != 2 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
     exit -1
 fi

@@ -10,7 +10,6 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_name=$2
-model_type=$3

 mkdir -p exp

@@ -25,14 +24,12 @@ python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
---model_type ${model_type} \
 --seed ${seed}
 else
 python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
---model_type ${model_type} \
 --seed ${seed}
 fi
examples/aishell/asr0/run.sh
@@ -7,8 +7,7 @@ stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml  #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml
 decode_conf_path=conf/tuning/decode.yaml
-avg_num=1
-model_type=offline  # offline or online
+avg_num=10
 audio_file=data/demo_01_03.wav

 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

@@ -25,7 +24,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

@@ -35,21 +34,21 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
+    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi

 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # test export ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit|| exit -1
 fi

 # Optionally, you can add LM and test it with runtime.
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
     # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi
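
Since run.sh still sources ${MAIN_ROOT}/utils/parse_options.sh, each variable above remains overridable from the command line even though model_type is gone; a hedged sketch of driving a single stage (values illustrative):

    # run only the checkpoint-averaging test stage against the online config
    bash run.sh --stage 3 --stop_stage 3 --conf_path conf/deepspeech2_online.yaml --avg_num 10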
examples/librispeech/asr0/conf/augmentation.json  (deleted, mode 100644 -> 0, +0 -36)

[
    {
        "type": "speed",
        "params": {
            "min_speed_rate": 0.9,
            "max_speed_rate": 1.1,
            "num_rates": 3
        },
        "prob": 0.0
    },
    {
        "type": "shift",
        "params": {
            "min_shift_ms": -5,
            "max_shift_ms": 5
        },
        "prob": 1.0
    },
    {
        "type": "specaug",
        "params": {
            "W": 0,
            "warp_mode": "PIL",
            "F": 10,
            "n_freq_masks": 2,
            "T": 50,
            "n_time_masks": 2,
            "p": 1.0,
            "adaptive_number_ratio": 0,
            "adaptive_size_ratio": 0,
            "max_n_time_masks": 20,
            "replace_with_zero": true
        },
        "prob": 1.0
    }
]
examples/librispeech/asr0/conf/deepspeech2.yaml
@@ -15,51 +15,51 @@ max_output_input_ratio: .inf
 ###########################################
 # Dataloader                              #
 ###########################################
-batch_size: 20
-mean_std_filepath: data/mean_std.json
-unit_type: char
 vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
-feat_dim:
-target_sample_rate: 16000
-max_freq: None
-n_fft: None
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+feat_dim: 161
 stride_ms: 10.0
-window_ms: 20.0
-delta_delta: False
-dither: 1.0
-use_dB_normalization: True
-target_dB: -20
-random_seed: 0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 2
+window_ms: 25.0
+sortagrad: 0  # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512  # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0  # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1

 ############################################
 # Network Architecture                     #
 ############################################
 num_conv_layers: 2
-num_rnn_layers: 3
-rnn_layer_size: 2048
+num_rnn_layers: 5
+rnn_layer_size: 1024
+rnn_direction: bidirect
+num_fc_layers: 0
+fc_layers_size_list: -1
 use_gru: False
-share_rnn_weights: True
 blank_id: 0

 ###########################################
 # Training                                #
 ###########################################
-n_epoch: 50
+n_epoch: 15
 accum_grad: 1
-lr: 1.0e-3
-lr_decay: 0.83
+lr: 5.0e-4
+lr_decay: 0.93
 weight_decay: 1.0e-6
 global_grad_clip: 5.0
-log_interval: 100
+dist_sampler: False
+log_interval: 1
 checkpoint:
   kbest_n: 50
   latest_n: 5
examples/librispeech/asr0/conf/deepspeech2_online.yaml
@@ -15,39 +15,36 @@ max_output_input_ratio: .inf
 ###########################################
 # Dataloader                              #
 ###########################################
-batch_size: 15
-mean_std_filepath: data/mean_std.json
-unit_type: char
 vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
-feat_dim:
-target_sample_rate: 16000
-max_freq: None
-n_fft: None
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+feat_dim: 161
 stride_ms: 10.0
-window_ms: 20.0
-delta_delta: False
-dither: 1.0
-use_dB_normalization: True
-target_dB: -20
-random_seed: 0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 0
+window_ms: 25.0
+sortagrad: 0  # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512  # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0  # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1

 ############################################
 # Network Architecture                     #
 ############################################
 num_conv_layers: 2
-num_rnn_layers: 3
-rnn_layer_size: 2048
+num_rnn_layers: 5
+rnn_layer_size: 1024
 rnn_direction: forward
-num_fc_layers: 2
-fc_layers_size_list: 512, 256
+num_fc_layers: 0
+fc_layers_size_list: -1
 use_gru: False
 blank_id: 0

@@ -55,13 +52,13 @@ blank_id: 0
 ###########################################
 # Training                                #
 ###########################################
-n_epoch: 50
-accum_grad: 4
-lr: 1.0e-3
-lr_decay: 0.83
+n_epoch: 65
+accum_grad: 1
+lr: 5.0e-4
+lr_decay: 0.93
 weight_decay: 1.0e-6
 global_grad_clip: 5.0
-log_interval: 100
+log_interval: 1
 checkpoint:
   kbest_n: 50
   latest_n: 5
examples/librispeech/asr0/conf/preprocess.yaml  (new file, mode 100644, +25 -0)

process:
    # extract kaldi fbank from PCM
    - type: fbank_kaldi
      fs: 16000
      n_mels: 161
      n_shift: 160
      win_length: 400
      dither: 0.1
    - type: cmvn_json
      cmvn_path: data/mean_std.json
    # these three processes are a.k.a. SpecAugument
    - type: time_warp
      max_time_warp: 5
      inplace: true
      mode: PIL
    - type: freq_mask
      F: 30
      n_mask: 2
      inplace: true
      replace_with_zero: false
    - type: time_mask
      T: 40
      n_mask: 2
      inplace: true
      replace_with_zero: false
examples/librispeech/asr0/local/data.sh
@@ -49,12 +49,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=2000 \
-    --spectrum_type="linear" \
+    --spectrum_type="fbank" \
+    --feat_dim=161 \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10 \
-    --window_ms=20 \
-    --use_dB_normalization=True \
+    --window_ms=25 \
+    --use_dB_normalization=False \
     --num_workers=${num_workers} \
     --output_path="data/mean_std.json"
examples/librispeech/asr0/local/export.sh
 #!/bin/bash

-if [ $# != 4 ];then
-    echo "usage: $0 config_path ckpt_prefix jit_model_path model_type"
+if [ $# != 3 ];then
+    echo "usage: $0 config_path ckpt_prefix jit_model_path"
     exit -1
 fi

@@ -11,14 +11,12 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_path_prefix=$2
 jit_model_export_path=$3
-model_type=$4

 python3 -u ${BIN_DIR}/export.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --checkpoint_path ${ckpt_path_prefix} \
---export_path ${jit_model_export_path} \
---model_type ${model_type}
+--export_path ${jit_model_export_path}

 if [ $? -ne 0 ]; then
     echo "Failed in export!"
examples/librispeech/asr0/local/test.sh
 #!/bin/bash

-if [ $# != 4 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
     exit -1
 fi

+stage=0
+stop_stage=100
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."

@@ -11,7 +13,6 @@ echo "using $ngpu gpus..."
 config_path=$1
 decode_config_path=$2
 ckpt_prefix=$3
-model_type=$4

 # download language model
 bash local/download_lm_en.sh

@@ -19,17 +20,43 @@ if [ $? -ne 0 ]; then
     exit 1
 fi

-python3 -u ${BIN_DIR}/test.py \
---ngpu ${ngpu} \
---config ${config_path} \
---decode_cfg ${decode_config_path} \
---result_file ${ckpt_prefix}.rsl \
---checkpoint_path ${ckpt_prefix} \
---model_type ${model_type}
-
-if [ $? -ne 0 ]; then
-    echo "Failed in evaluation!"
-    exit 1
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # format the reference test file
+    python3 utils/format_rsl.py \
+        --origin_ref data/manifest.test-clean.raw \
+        --trans_ref data/manifest.test-clean.text
+
+    python3 -u ${BIN_DIR}/test.py \
+    --ngpu ${ngpu} \
+    --config ${config_path} \
+    --decode_cfg ${decode_config_path} \
+    --result_file ${ckpt_prefix}.rsl \
+    --checkpoint_path ${ckpt_prefix}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+
+    python3 utils/format_rsl.py \
+        --origin_hyp ${ckpt_prefix}.rsl \
+        --trans_hyp ${ckpt_prefix}.rsl.text
+
+    python3 utils/compute-wer.py --char=1 --v=1 \
+        data/manifest.test-clean.text ${ckpt_prefix}.rsl.text > ${ckpt_prefix}.error
+fi
+
+if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
+    python3 utils/format_rsl.py \
+        --origin_ref data/manifest.test-clean.raw \
+        --trans_ref_sclite data/manifest.test.text-clean.sclite
+
+    python3 utils/format_rsl.py \
+        --origin_hyp ${ckpt_prefix}.rsl \
+        --trans_hyp_sclite ${ckpt_prefix}.rsl.text.sclite
+
+    mkdir -p ${ckpt_prefix}_sclite
+    sclite -i wsj -r data/manifest.test-clean.text.sclite -h ${ckpt_prefix}.rsl.text.sclite -e utf-8 -o all -O ${ckpt_prefix}_sclite -c NOASCII
 fi
examples/librispeech/asr0/local/test_wav.sh
 #!/bin/bash

-if [ $# != 5 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file"
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
     exit -1
 fi

@@ -11,8 +11,7 @@ echo "using $ngpu gpus..."
 config_path=$1
 decode_config_path=$2
 ckpt_prefix=$3
-model_type=$4
-audio_file=$5
+audio_file=$4

 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/

@@ -37,7 +36,6 @@ python3 -u ${BIN_DIR}/test_wav.py \
 --decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
---model_type ${model_type} \
 --audio_file ${audio_file}

 if [ $? -ne 0 ]; then
examples/librispeech/asr0/local/train.sh
 #!/bin/bash

-if [ $# != 3 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type"
+if [ $# != 2 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
     exit -1
 fi

@@ -10,7 +10,6 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_name=$2
-model_type=$3

 mkdir -p exp

@@ -25,14 +24,12 @@ python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
---model_type ${model_type} \
 --seed ${seed}
 else
 python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
---model_type ${model_type} \
 --seed ${seed}
 fi
examples/librispeech/asr0/run.sh
@@ -2,13 +2,12 @@
 set -e
 source path.sh

-gpus=0,1,2,3,4,5,6,7
+gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
 decode_conf_path=conf/tuning/decode.yaml
-avg_num=30
-model_type=offline
+avg_num=5
 audio_file=data/demo_002_en.wav

 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

@@ -24,7 +23,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

@@ -34,15 +33,20 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
+    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi

 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # test export ckpt avg_n
+    CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit|| exit -1
+fi
+
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
     # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi
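
Stage 5 (testing the exported jit model) is the part newly added to this script; a hedged sketch of exercising just that stage, assuming stages 1-4 have already produced exp/${ckpt}/checkpoints/${avg_ckpt}.jit:

    bash run.sh --stage 5 --stop_stage 5 --avg_num 5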
examples/tiny/asr0/conf/augmentation.json  (deleted, mode 100644 -> 0, +0 -36)

[
    {
        "type": "speed",
        "params": {
            "min_speed_rate": 0.9,
            "max_speed_rate": 1.1,
            "num_rates": 3
        },
        "prob": 0.0
    },
    {
        "type": "shift",
        "params": {
            "min_shift_ms": -5,
            "max_shift_ms": 5
        },
        "prob": 1.0
    },
    {
        "type": "specaug",
        "params": {
            "W": 5,
            "warp_mode": "PIL",
            "F": 30,
            "n_freq_masks": 2,
            "T": 40,
            "n_time_masks": 2,
            "p": 1.0,
            "adaptive_number_ratio": 0,
            "adaptive_size_ratio": 0,
            "max_n_time_masks": 20,
            "replace_with_zero": true
        },
        "prob": 1.0
    }
]
examples/tiny/asr0/conf/deepspeech2.yaml
@@ -16,28 +16,26 @@ max_output_input_ratio: 10.0
 ###########################################
 # Dataloader                              #
 ###########################################
-mean_std_filepath: data/mean_std.json
-unit_type: char
-vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
+vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
 feat_dim: 161
-delta_delta: False
 stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 2
+window_ms: 25.0
+sortagrad: 0  # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
 batch_size: 4
+maxlen_in: 512  # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0  # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1

@@ -45,8 +43,10 @@ batch_size: 4
 num_conv_layers: 2
 num_rnn_layers: 3
 rnn_layer_size: 2048
+rnn_direction: bidirect  # [forward, bidirect]
+num_fc_layers: 0
+fc_layers_size_list: -1,
 use_gru: False
-share_rnn_weights: True
 blank_id: 0

@@ -59,6 +59,7 @@ lr: 1.0e-5
 lr_decay: 0.8
 weight_decay: 1.0e-6
 global_grad_clip: 5.0
+dist_sampler: False
 log_interval: 1
 checkpoint:
   kbest_n: 3
examples/tiny/asr0/conf/deepspeech2_online.yaml
@@ -16,29 +16,27 @@ max_output_input_ratio: 10.0
 ###########################################
 # Dataloader                              #
 ###########################################
-mean_std_filepath: data/mean_std.json
-unit_type: char
-vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
+vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
 feat_dim: 161
-delta_delta: False
 stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 0
+window_ms: 25.0
+sortagrad: 0  # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
 batch_size: 4
+maxlen_in: 512  # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0  # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1

 ############################################
 # Network Architecture                     #
 ############################################

@@ -61,6 +59,7 @@ lr: 1.0e-5
 lr_decay: 1.0
 weight_decay: 1.0e-6
 global_grad_clip: 5.0
+dist_sampler: False
 log_interval: 1
 checkpoint:
   kbest_n: 3
examples/tiny/asr0/conf/preprocess.yaml  (new file, mode 100644, +25 -0)

process:
    # extract kaldi fbank from PCM
    - type: fbank_kaldi
      fs: 16000
      n_mels: 161
      n_shift: 160
      win_length: 400
      dither: 0.1
    - type: cmvn_json
      cmvn_path: data/mean_std.json
    # these three processes are a.k.a. SpecAugument
    - type: time_warp
      max_time_warp: 5
      inplace: true
      mode: PIL
    - type: freq_mask
      F: 30
      n_mask: 2
      inplace: true
      replace_with_zero: false
    - type: time_mask
      T: 40
      n_mask: 2
      inplace: true
      replace_with_zero: false
examples/tiny/asr0/local/export.sh
 #!/bin/bash

-if [ $# != 4 ];then
-    echo "usage: $0 config_path ckpt_prefix jit_model_path model_type"
+if [ $# != 3 ];then
+    echo "usage: $0 config_path ckpt_prefix jit_model_path"
     exit -1
 fi

@@ -11,14 +11,12 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_path_prefix=$2
 jit_model_export_path=$3
-model_type=$4

 python3 -u ${BIN_DIR}/export.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --checkpoint_path ${ckpt_path_prefix} \
---export_path ${jit_model_export_path} \
---model_type ${model_type}
+--export_path ${jit_model_export_path}

 if [ $? -ne 0 ]; then
     echo "Failed in export!"
examples/tiny/asr0/local/test.sh
 #!/bin/bash

-if [ $# != 4 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
     exit -1
 fi

@@ -11,7 +11,6 @@ echo "using $ngpu gpus..."
 config_path=$1
 decode_config_path=$2
 ckpt_prefix=$3
-model_type=$4

 # download language model
 bash local/download_lm_en.sh

@@ -24,8 +23,7 @@ python3 -u ${BIN_DIR}/test.py \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
---checkpoint_path ${ckpt_prefix} \
---model_type ${model_type}
+--checkpoint_path ${ckpt_prefix}

 if [ $? -ne 0 ]; then
     echo "Failed in evaluation!"
examples/tiny/asr0/local/train.sh
@@ -15,14 +15,13 @@ if [ ${seed} != 0 ]; then
     echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
 fi

-if [ $# != 3 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type"
+if [ $# != 2 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
     exit -1
 fi

 config_path=$1
 ckpt_name=$2
-model_type=$3

 mkdir -p exp

@@ -31,7 +30,6 @@ python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
---model_type ${model_type} \
 --profiler-options "${profiler_options}" \
 --seed ${seed}
 else

@@ -39,7 +37,6 @@ python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/t
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
---model_type ${model_type} \
 --profiler-options "${profiler_options}" \
 --seed ${seed}
 fi
examples/tiny/asr0/run.sh
@@ -8,8 +8,6 @@ stop_stage=100
 conf_path=conf/deepspeech2.yaml
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=1
-model_type=offline

 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

 avg_ckpt=avg_${avg_num}

@@ -23,7 +21,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

@@ -33,10 +31,10 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
paddlespeech/cli/asr/infer.py
浏览文件 @
919c8d06
...
@@ -138,6 +138,7 @@ class ASRExecutor(BaseExecutor):
...
@@ -138,6 +138,7 @@ class ASRExecutor(BaseExecutor):
tag
=
model_type
+
'-'
+
lang
+
'-'
+
sample_rate_str
tag
=
model_type
+
'-'
+
lang
+
'-'
+
sample_rate_str
self
.
task_resource
.
set_task_model
(
tag
,
version
=
None
)
self
.
task_resource
.
set_task_model
(
tag
,
version
=
None
)
self
.
res_path
=
self
.
task_resource
.
res_dir
self
.
res_path
=
self
.
task_resource
.
res_dir
self
.
cfg_path
=
os
.
path
.
join
(
self
.
cfg_path
=
os
.
path
.
join
(
self
.
res_path
,
self
.
task_resource
.
res_dict
[
'cfg_path'
])
self
.
res_path
,
self
.
task_resource
.
res_dict
[
'cfg_path'
])
self
.
ckpt_path
=
os
.
path
.
join
(
self
.
ckpt_path
=
os
.
path
.
join
(
...
@@ -158,15 +159,18 @@ class ASRExecutor(BaseExecutor):
             self.config.merge_from_file(self.cfg_path)
             with UpdateConfig(self.config):
-                if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
-                    from paddlespeech.s2t.io.collator import SpeechCollator
-                    self.vocab = self.config.vocab_filepath
+                if self.config.spm_model_prefix:
+                    self.config.spm_model_prefix = os.path.join(
+                        self.res_path, self.config.spm_model_prefix)
+                self.text_feature = TextFeaturizer(
+                    unit_type=self.config.unit_type,
+                    vocab=self.config.vocab_filepath,
+                    spm_model_prefix=self.config.spm_model_prefix)
+                if "deepspeech2" in model_type:
                     self.config.decode.lang_model_path = os.path.join(
                         MODEL_HOME, 'language_model',
                         self.config.decode.lang_model_path)
-                    self.collate_fn_test = SpeechCollator.from_config(self.config)
-                    self.text_feature = TextFeaturizer(
-                        unit_type=self.config.unit_type, vocab=self.vocab)
                     lm_url = self.task_resource.res_dict['lm_url']
                     lm_md5 = self.task_resource.res_dict['lm_md5']
                     self.download_lm(
...
@@ -174,12 +178,6 @@ class ASRExecutor(BaseExecutor):
                         os.path.dirname(self.config.decode.lang_model_path),
                         lm_md5)
             elif "conformer" in model_type or "transformer" in model_type:
-                self.config.spm_model_prefix = os.path.join(
-                    self.res_path, self.config.spm_model_prefix)
-                self.text_feature = TextFeaturizer(
-                    unit_type=self.config.unit_type,
-                    vocab=self.config.vocab_filepath,
-                    spm_model_prefix=self.config.spm_model_prefix)
                 self.config.decode.decoding_method = decode_method
             else:
...
@@ -222,19 +220,7 @@ class ASRExecutor(BaseExecutor):
         logger.info("Preprocess audio_file:" + audio_file)

         # Get the object for feature extraction
-        if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
-            audio, _ = self.collate_fn_test.process_utterance(
-                audio_file=audio_file, transcript=" ")
-            audio_len = audio.shape[0]
-            audio = paddle.to_tensor(audio, dtype='float32')
-            audio_len = paddle.to_tensor(audio_len)
-            audio = paddle.unsqueeze(audio, axis=0)
-            # vocab_list = collate_fn_test.vocab_list
-            self._inputs["audio"] = audio
-            self._inputs["audio_len"] = audio_len
-            logger.info(f"audio feat shape: {audio.shape}")
-        elif "conformer" in model_type or "transformer" in model_type:
+        if "deepspeech2" in model_type or "conformer" in model_type or "transformer" in model_type:
             logger.info("get the preprocess conf")
             preprocess_conf = self.config.preprocess_config
             preprocess_args = {"train": False}
...
@@ -242,7 +228,6 @@ class ASRExecutor(BaseExecutor):
             logger.info("read the audio file")
             audio, audio_sample_rate = soundfile.read(
                 audio_file, dtype="int16", always_2d=True)
             if self.change_format:
                 if audio.shape[1] >= 2:
                     audio = audio.mean(axis=1, dtype=np.int16)
...
@@ -285,7 +270,7 @@ class ASRExecutor(BaseExecutor):
         cfg = self.config.decode
         audio = self._inputs["audio"]
         audio_len = self._inputs["audio_len"]
-        if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
+        if "deepspeech2" in model_type:
             decode_batch_size = audio.shape[0]
             self.model.decoder.init_decoder(
                 decode_batch_size, self.text_feature.vocab_list,
...
paddlespeech/resource/model_alias.py  View file @ 919c8d06
...
@@ -23,7 +23,7 @@ model_alias = {
     # ---------------------------------
     "deepspeech2offline": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
-    "deepspeech2online": ["paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline"],
+    "deepspeech2online": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
     "conformer": ["paddlespeech.s2t.models.u2:U2Model"],
     "conformer_online": ["paddlespeech.s2t.models.u2:U2Model"],
     "transformer": ["paddlespeech.s2t.models.u2:U2Model"],
...
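
Each alias value is a "module:Class" import path; after this change both deepspeech2 aliases resolve to the unified DeepSpeech2Model. A minimal sketch of how such a string can be resolved (the repo's own dynamic-import helper is not part of this diff):

    import importlib

    def resolve_alias(alias: str):
        module_name, class_name = alias.split(":")
        return getattr(importlib.import_module(module_name), class_name)

    # resolve_alias("paddlespeech.s2t.models.ds2:DeepSpeech2Model")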
paddlespeech/resource/pretrained_models.py  View file @ 919c8d06
...
@@ -136,9 +136,9 @@ asr_dynamic_pretrained_models = {
     "deepspeech2online_wenetspeech-zh-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.1.model.tar.gz',
             'md5':
-            'e393d4d274af0f6967db24fc146e8074',
+            'd1be86a3e786042ab64f05161b5fae62',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
...
@@ -152,13 +152,13 @@ asr_dynamic_pretrained_models = {
     "deepspeech2offline_aishell-zh-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz',
             'md5':
-            '932c3593d62fe5c741b59b31318aa314',
+            '4d26066c6f19f52087425dc722ae5b13',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
-            'exp/deepspeech2/checkpoints/avg_1',
+            'exp/deepspeech2/checkpoints/avg_10',
             'lm_url':
             'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
             'lm_md5':
...
@@ -168,9 +168,9 @@ asr_dynamic_pretrained_models = {
     "deepspeech2online_aishell-zh-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_1.0.1.model.tar.gz',
             'md5':
-            '98b87b171b7240b7cae6e07d8d0bc9be',
+            'df5ddeac8b679a470176649ac4b78726',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
...
@@ -188,13 +188,13 @@ asr_dynamic_pretrained_models = {
     "deepspeech2offline_librispeech-en-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_offline_librispeech_ckpt_1.0.1.model.tar.gz',
             'md5':
-            'f5666c81ad015c8de03aac2bc92e5762',
+            'ed9e2b008a65268b3484020281ab048c',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
-            'exp/deepspeech2/checkpoints/avg_1',
+            'exp/deepspeech2/checkpoints/avg_5',
             'lm_url':
             'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm',
             'lm_md5':
...
@@ -207,17 +207,17 @@ asr_static_pretrained_models = {
     "deepspeech2offline_aishell-zh-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz',
             'md5':
-            '932c3593d62fe5c741b59b31318aa314',
+            '4d26066c6f19f52087425dc722ae5b13',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
-            'exp/deepspeech2/checkpoints/avg_1',
+            'exp/deepspeech2/checkpoints/avg_10',
             'model':
-            'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel',
+            'exp/deepspeech2/checkpoints/avg_10.jit.pdmodel',
             'params':
-            'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams',
+            'exp/deepspeech2/checkpoints/avg_10.jit.pdiparams',
             'lm_url':
             'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
             'lm_md5':
...
@@ -830,7 +830,7 @@ vector_dynamic_pretrained_models = {
             'cfg_path':
             'conf/model.yaml',  # the yaml config path
             'ckpt_path':
             'model/model',  # the format is ${dir}/{model_name},
             # so the first 'model' is dir, the second 'model' is the name
             # this means we have a model stored as model/model.pdparams
         },
...
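
Taken together, these registry edits repoint every deepspeech2 tag at the retrained 1.0.x checkpoints. A hedged sketch of how a tag plus version indexes the dict (the CLI's actual download and md5-verification helpers are outside this diff):

    def lookup(models: dict, tag: str, version: str = '1.0') -> tuple:
        entry = models[tag][version]
        return entry['url'], entry['md5']

    # lookup(asr_static_pretrained_models, "deepspeech2offline_aishell-zh-16k")
    # -> ('...asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz',
    #     '4d26066c6f19f52087425dc722ae5b13')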
paddlespeech/s2t/exps/deepspeech2/bin/export.py  View file @ 919c8d06
...
@@ -32,11 +32,9 @@ def main(config, args):

 if __name__ == "__main__":
     parser = default_argument_parser()
     # save jit model to
     parser.add_argument(
         "--export_path", type=str, help="path of the jit model to save")
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help="offline/online")
     parser.add_argument(
         '--nxpu',
         type=int,
...
@@ -44,7 +42,6 @@ if __name__ == "__main__":
         choices=[0, 1],
         help="if nxpu == 0 and ngpu == 0, use cpu.")
     args = parser.parse_args()
-    print("model_type:{}".format(args.model_type))
     print_arguments(args)

     # https://yaml.org/type/float.html
...
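
One practical consequence of dropping --model_type across these entry points: launch scripts that still pass the flag will now leave it unconsumed. A quick check with plain argparse (default_argument_parser is assumed here to behave like a standard parser):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--export_path", type=str)
    args, unknown = parser.parse_known_args(
        ["--export_path", "exp/avg_1.jit", "--model_type", "offline"])
    print(unknown)  # ['--model_type', 'offline'] -- the flag is no longer recognized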
paddlespeech/s2t/exps/deepspeech2/bin/test.py  View file @ 919c8d06
...
@@ -32,9 +32,7 @@ def main(config, args):

 if __name__ == "__main__":
     parser = default_argument_parser()
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
     # save asr result to
     parser.add_argument(
         "--result_file", type=str, help="path of save the asr result")
     parser.add_argument(
...
@@ -45,7 +43,6 @@ if __name__ == "__main__":
         help="if nxpu == 0 and ngpu == 0, use cpu.")
     args = parser.parse_args()
     print_arguments(args, globals())
-    print("model_type:{}".format(args.model_type))

     # https://yaml.org/type/float.html
     config = CfgNode(new_allowed=True)
...
paddlespeech/s2t/exps/deepspeech2/bin/test_export.py  View file @ 919c8d06
...
@@ -38,8 +38,6 @@ if __name__ == "__main__":
     #load jit model from
     parser.add_argument(
         "--export_path", type=str, help="path of the jit model to save")
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
     parser.add_argument(
         '--nxpu',
         type=int,
...
@@ -50,7 +48,6 @@ if __name__ == "__main__":
         "--enable-auto-log", action="store_true", help="use auto log")
     args = parser.parse_args()
     print_arguments(args, globals())
-    print("model_type:{}".format(args.model_type))

     # https://yaml.org/type/float.html
     config = CfgNode(new_allowed=True)
...
paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py  View file @ 919c8d06
...
@@ -23,7 +23,6 @@ from yacs.config import CfgNode
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.io.collator import SpeechCollator
 from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
 from paddlespeech.s2t.training.cli import default_argument_parser
 from paddlespeech.s2t.utils import mp_tools
 from paddlespeech.s2t.utils.checkpoint import Checkpoint
...
@@ -113,12 +112,7 @@ class DeepSpeech2Tester_hub():
         config.input_dim = self.collate_fn_test.feature_size
         config.output_dim = self.collate_fn_test.vocab_size
-        if self.args.model_type == 'offline':
-            model = DeepSpeech2Model.from_config(config)
-        elif self.args.model_type == 'online':
-            model = DeepSpeech2ModelOnline.from_config(config)
-        else:
-            raise Exception("wrong model type")
+        model = DeepSpeech2Model.from_config(config)
         self.model = model
...
@@ -172,8 +166,6 @@ def main(config, args):

 if __name__ == "__main__":
     parser = default_argument_parser()
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
     parser.add_argument("--audio_file", type=str, help='audio file path')
     # save asr result to
     parser.add_argument(
...
@@ -184,7 +176,6 @@ if __name__ == "__main__":
         print("Please input the audio file path")
         sys.exit(-1)
     check(args.audio_file)
-    print("model_type:{}".format(args.model_type))
     # https://yaml.org/type/float.html
     config = CfgNode(new_allowed=True)
...
paddlespeech/s2t/exps/deepspeech2/bin/train.py  View file @ 919c8d06
...
@@ -31,8 +31,6 @@ def main(config, args):

 if __name__ == "__main__":
     parser = default_argument_parser()
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
     parser.add_argument(
         '--nxpu',
         type=int,
...
@@ -40,7 +38,6 @@ if __name__ == "__main__":
         choices=[0, 1],
         help="if nxpu == 0 and ngpu == 0, use cpu.")
     args = parser.parse_args()
-    print("model_type:{}".format(args.model_type))
     print_arguments(args, globals())

     # https://yaml.org/type/float.html
...
paddlespeech/s2t/exps/deepspeech2/model.py  View file @ 919c8d06
...
@@ -23,16 +23,12 @@ import paddle
 from paddle import distributed as dist
 from paddle import inference
 from paddle.io import DataLoader

+from paddlespeech.s2t.io.dataloader import BatchDataLoader
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.io.collator import SpeechCollator
 from paddlespeech.s2t.io.dataset import ManifestDataset
-from paddlespeech.s2t.io.sampler import SortagradBatchSampler
-from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
 from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
 from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
 from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
 from paddlespeech.s2t.training.reporter import report
 from paddlespeech.s2t.training.timer import Timer
...
...
@@ -136,18 +132,13 @@ class DeepSpeech2Trainer(Trainer):
         config = self.config.clone()
         with UpdateConfig(config):
             if self.train:
-                config.input_dim = self.train_loader.collate_fn.feature_size
-                config.output_dim = self.train_loader.collate_fn.vocab_size
+                config.input_dim = self.train_loader.feat_dim
+                config.output_dim = self.train_loader.vocab_size
             else:
-                config.input_dim = self.test_loader.collate_fn.feature_size
-                config.output_dim = self.test_loader.collate_fn.vocab_size
+                config.input_dim = self.test_loader.feat_dim
+                config.output_dim = self.test_loader.vocab_size

-        if self.args.model_type == 'offline':
-            model = DeepSpeech2Model.from_config(config)
-        elif self.args.model_type == 'online':
-            model = DeepSpeech2ModelOnline.from_config(config)
-        else:
-            raise Exception("wrong model type")
+        model = DeepSpeech2Model.from_config(config)

         if self.parallel:
             model = paddle.DataParallel(model)
...
...
@@ -175,76 +166,81 @@ class DeepSpeech2Trainer(Trainer):
         config = self.config.clone()
         config.defrost()

         if self.train:
-            # train
-            config.manifest = config.train_manifest
-            train_dataset = ManifestDataset.from_config(config)
-            if self.parallel:
-                batch_sampler = SortagradDistributedBatchSampler(
-                    train_dataset,
-                    batch_size=config.batch_size,
-                    num_replicas=None,
-                    rank=None,
-                    shuffle=True,
-                    drop_last=True,
-                    sortagrad=config.sortagrad,
-                    shuffle_method=config.shuffle_method)
-            else:
-                batch_sampler = SortagradBatchSampler(
-                    train_dataset,
-                    shuffle=True,
-                    batch_size=config.batch_size,
-                    drop_last=True,
-                    sortagrad=config.sortagrad,
-                    shuffle_method=config.shuffle_method)
-            config.keep_transcription_text = False
-            collate_fn_train = SpeechCollator.from_config(config)
-            self.train_loader = DataLoader(
-                train_dataset,
-                batch_sampler=batch_sampler,
-                collate_fn=collate_fn_train,
-                num_workers=config.num_workers)
-            # dev
-            config.manifest = config.dev_manifest
-            dev_dataset = ManifestDataset.from_config(config)
-            config.augmentation_config = ""
-            config.keep_transcription_text = False
-            collate_fn_dev = SpeechCollator.from_config(config)
-            self.valid_loader = DataLoader(
-                dev_dataset,
-                batch_size=int(config.batch_size),
-                shuffle=False,
-                drop_last=False,
-                collate_fn=collate_fn_dev,
-                num_workers=config.num_workers)
+            # train/valid dataset, return token ids
+            self.train_loader = BatchDataLoader(
+                json_file=config.train_manifest,
+                train_mode=True,
+                sortagrad=config.sortagrad,
+                batch_size=config.batch_size,
+                maxlen_in=config.maxlen_in,
+                maxlen_out=config.maxlen_out,
+                minibatches=config.minibatches,
+                mini_batch_size=self.args.ngpu,
+                batch_count=config.batch_count,
+                batch_bins=config.batch_bins,
+                batch_frames_in=config.batch_frames_in,
+                batch_frames_out=config.batch_frames_out,
+                batch_frames_inout=config.batch_frames_inout,
+                preprocess_conf=config.preprocess_config,
+                n_iter_processes=config.num_workers,
+                subsampling_factor=1,
+                num_encs=1,
+                dist_sampler=config.get('dist_sampler', False),
+                shortest_first=False)
+
+            self.valid_loader = BatchDataLoader(
+                json_file=config.dev_manifest,
+                train_mode=False,
+                sortagrad=False,
+                batch_size=config.batch_size,
+                maxlen_in=float('inf'),
+                maxlen_out=float('inf'),
+                minibatches=0,
+                mini_batch_size=self.args.ngpu,
+                batch_count='auto',
+                batch_bins=0,
+                batch_frames_in=0,
+                batch_frames_out=0,
+                batch_frames_inout=0,
+                preprocess_conf=config.preprocess_config,
+                n_iter_processes=config.num_workers,
+                subsampling_factor=1,
+                num_encs=1,
+                dist_sampler=config.get('dist_sampler', False),
+                shortest_first=False)
             logger.info("Setup train/valid Dataloader!")
         else:
-            # test
-            config.manifest = config.test_manifest
-            test_dataset = ManifestDataset.from_config(config)
-            config.augmentation_config = ""
-            config.keep_transcription_text = True
-            collate_fn_test = SpeechCollator.from_config(config)
             decode_batch_size = config.get('decode', dict()).get(
                 'decode_batch_size', 1)
-            self.test_loader = DataLoader(
-                test_dataset,
-                batch_size=decode_batch_size,
-                shuffle=False,
-                drop_last=False,
-                collate_fn=collate_fn_test,
-                num_workers=config.num_workers)
-            logger.info("Setup test Dataloader!")
+            # test dataset, return raw text
+            self.test_loader = BatchDataLoader(
+                json_file=config.test_manifest,
+                train_mode=False,
+                sortagrad=False,
+                batch_size=decode_batch_size,
+                maxlen_in=float('inf'),
+                maxlen_out=float('inf'),
+                minibatches=0,
+                mini_batch_size=1,
+                batch_count='auto',
+                batch_bins=0,
+                batch_frames_in=0,
+                batch_frames_out=0,
+                batch_frames_inout=0,
+                preprocess_conf=config.preprocess_config,
+                n_iter_processes=1,
+                subsampling_factor=1,
+                num_encs=1)
+            logger.info("Setup test/align Dataloader!")
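
The migration pattern in this hunk is uniform: the training loader keeps the config's length limits, while the dev and test loaders pass float('inf') so no utterance is filtered out at evaluation time. Reduced to a sketch (the filter shape is illustrative, not BatchDataLoader's exact internals):

    def keep(ilen, olen, maxlen_in=float('inf'), maxlen_out=float('inf')):
        # length filter as used by maxlen-style batching
        return ilen <= maxlen_in and olen <= maxlen_out

    print(keep(2000, 120))  # True -- eval loaders never drop utterances by length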
 class DeepSpeech2Tester(DeepSpeech2Trainer):
     def __init__(self, config, args):
         super().__init__(config, args)
         self._text_featurizer = TextFeaturizer(
-            unit_type=config.unit_type, vocab=None)
+            unit_type=config.unit_type, vocab=config.vocab_filepath)
+        self.vocab_list = self._text_featurizer.vocab_list

     def ordid2token(self, texts, texts_len):
         """ ord() id to chr() chr """
...
@@ -252,7 +248,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         for text, n in zip(texts, texts_len):
             n = n.numpy().item()
             ids = text[:n]
-            trans.append(''.join([chr(i) for i in ids]))
+            trans.append(
+                self._text_featurizer.defeaturize(ids.numpy().tolist()))
         return trans

     def compute_metrics(self,
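
The detokenization change above is worth spelling out: the old path assumed token ids were raw unicode codepoints and rebuilt text with chr(), while the new path routes ids through the featurizer's vocabulary. A hedged illustration (the ids are invented):

    ids = [20013, 25991]                      # codepoints for "中文"
    old_style = ''.join(chr(i) for i in ids)  # works only while ids == codepoints
    # new_style = text_featurizer.defeaturize(ids)  # id -> token via vocab_filepath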
...
@@ -307,8 +303,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         # Initialized the decoder in model
         decode_cfg = self.config.decode
-        vocab_list = self.test_loader.collate_fn.vocab_list
-        decode_batch_size = self.test_loader.batch_size
+        vocab_list = self.vocab_list
+        decode_batch_size = decode_cfg.decode_batch_size
         self.model.decoder.init_decoder(
             decode_batch_size, vocab_list, decode_cfg.decoding_method,
             decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta,
...
...
@@ -338,17 +334,9 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
     @paddle.no_grad()
     def export(self):
-        if self.args.model_type == 'offline':
-            infer_model = DeepSpeech2InferModel.from_pretrained(
-                self.test_loader, self.config, self.args.checkpoint_path)
-        elif self.args.model_type == 'online':
-            infer_model = DeepSpeech2InferModelOnline.from_pretrained(
-                self.test_loader, self.config, self.args.checkpoint_path)
-        else:
-            raise Exception("wrong model type")
+        infer_model = DeepSpeech2InferModel.from_pretrained(
+            self.test_loader, self.config, self.args.checkpoint_path)
         infer_model.eval()
-        feat_dim = self.test_loader.collate_fn.feature_size
         static_model = infer_model.export()
         logger.info(f"Export code: {static_model.forward.code}")
         paddle.jit.save(static_model, self.args.export_path)
...
...
@@ -376,10 +364,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
         # Initialized the decoder in model
         decode_cfg = self.config.decode
-        vocab_list = self.test_loader.collate_fn.vocab_list
+        vocab_list = self.vocab_list
-        if self.args.model_type == "online":
+        if self.config.rnn_direction == "forward":
             decode_batch_size = 1
-        elif self.args.model_type == "offline":
+        elif self.config.rnn_direction == "bidirect":
             decode_batch_size = self.test_loader.batch_size
         else:
             raise Exception("wrong model type")
...
...
@@ -412,11 +400,11 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
         self.model.decoder.del_decoder()

     def compute_result_transcripts(self, audio, audio_len):
-        if self.args.model_type == "online":
+        if self.config.rnn_direction == "forward":
             output_probs, output_lens, trans_batch = self.static_forward_online(
                 audio, audio_len, decoder_chunk_size=1)
             result_transcripts = [trans[-1] for trans in trans_batch]
-        elif self.args.model_type == "offline":
+        elif self.config.rnn_direction == "bidirect":
             output_probs, output_lens = self.static_forward_offline(audio,
                                                                     audio_len)
         batch_size = output_probs.shape[0]
...
paddlespeech/s2t/models/ds2/conv.py  View file @ 919c8d06
...
@@ -11,161 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from paddle import nn
-from paddle.nn import functional as F
-from paddlespeech.s2t.modules.activation import brelu
-from paddlespeech.s2t.modules.mask import make_non_pad_mask
+import paddle
+
+from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4
 from paddlespeech.s2t.utils.log import Log

 logger = Log(__name__).getlog()

-__all__ = ['ConvStack', "conv_output_size"]
-
-
-def conv_output_size(I, F, P, S):
-    # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
-    # Output size after Conv:
-    #   By noting I the length of the input volume size,
-    #   F the length of the filter,
-    #   P the amount of zero padding,
-    #   S the stride,
-    # then the output size O of the feature map along that dimension is given by:
-    #   O = (I - F + Pstart + Pend) // S + 1
-    # When Pstart == Pend == P, we can replace Pstart + Pend by 2P.
-    # When Pstart == Pend == 0
-    #   O = (I - F - S) // S
-    # https://iq.opengenus.org/output-size-of-convolution/
-    # Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1
-    # Output width = (Output width + padding width right + padding width left - kernel width) / (stride width) + 1
-    return (I - F + 2 * P - S) // S
-
-
-# receptive field calculator
-# https://fomoro.com/research/article/receptive-field-calculator
-# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
-# https://distill.pub/2019/computing-receptive-fields/
-# Rl-1 = Sl * Rl + (Kl - Sl)
-
-
-class ConvBn(nn.Layer):
-    """Convolution layer with batch normalization.
-
-    :param kernel_size: The x dimension of a filter kernel. Or input a tuple for
-                        two image dimension.
-    :type kernel_size: int|tuple|list
-    :param num_channels_in: Number of input channels.
-    :type num_channels_in: int
-    :param num_channels_out: Number of output channels.
-    :type num_channels_out: int
-    :param stride: The x dimension of the stride. Or input a tuple for two
-                   image dimension.
-    :type stride: int|tuple|list
-    :param padding: The x dimension of the padding. Or input a tuple for two
-                    image dimension.
-    :type padding: int|tuple|list
-    :param act: Activation type, relu|brelu
-    :type act: string
-    :return: Batch norm layer after convolution layer.
-    :rtype: Variable
-    """
-
-    def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,
-                 padding, act):
-        super().__init__()
-        assert len(kernel_size) == 2
-        assert len(stride) == 2
-        assert len(padding) == 2
-        self.kernel_size = kernel_size
-        self.stride = stride
-        self.padding = padding
-
-        self.conv = nn.Conv2D(
-            num_channels_in,
-            num_channels_out,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=padding,
-            weight_attr=None,
-            bias_attr=False,
-            data_format='NCHW')
-
-        self.bn = nn.BatchNorm2D(
-            num_channels_out,
-            weight_attr=None,
-            bias_attr=None,
-            data_format='NCHW')
-        self.act = F.relu if act == 'relu' else brelu
-
-    def forward(self, x, x_len):
-        """
-        x(Tensor): audio, shape [B, C, D, T]
-        """
-        x = self.conv(x)
-        x = self.bn(x)
-        x = self.act(x)
-
-        x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]
-                 ) // self.stride[1] + 1
-
-        # reset padding part to 0
-        masks = make_non_pad_mask(x_len)  #[B, T]
-        masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
-        # TODO(Hui Zhang): not support bool multiply
-        # masks = masks.type_as(x)
-        masks = masks.astype(x.dtype)
-        x = x.multiply(masks)
-        return x, x_len
-
-
-class ConvStack(nn.Layer):
-    """Convolution group with stacked convolution layers.
-
-    :param feat_size: audio feature dim.
-    :type feat_size: int
-    :param num_stacks: Number of stacked convolution layers.
-    :type num_stacks: int
-    """
-
-    def __init__(self, feat_size, num_stacks):
-        super().__init__()
-        self.feat_size = feat_size  # D
-        self.num_stacks = num_stacks
-
-        self.conv_in = ConvBn(
-            num_channels_in=1,
-            num_channels_out=32,
-            kernel_size=(41, 11),  #[D, T]
-            stride=(2, 3),
-            padding=(20, 5),
-            act='brelu')
-
-        out_channel = 32
-        convs = [
-            ConvBn(
-                num_channels_in=32,
-                num_channels_out=out_channel,
-                kernel_size=(21, 11),
-                stride=(2, 1),
-                padding=(10, 5),
-                act='brelu') for i in range(num_stacks - 1)
-        ]
-        self.conv_stack = nn.LayerList(convs)
-
-        # conv output feat_dim
-        output_height = (feat_size - 1) // 2 + 1
-        for i in range(self.num_stacks - 1):
-            output_height = (output_height - 1) // 2 + 1
-        self.output_height = out_channel * output_height
-
-    def forward(self, x, x_len):
-        """
-        x: shape [B, C, D, T]
-        x_len : shape [B]
-        """
-        x, x_len = self.conv_in(x, x_len)
-        for i, conv in enumerate(self.conv_stack):
-            x, x_len = conv(x, x_len)
-        return x, x_len
+
+class Conv2dSubsampling4Pure(Conv2dSubsampling4):
+    def __init__(self, idim: int, odim: int, dropout_rate: float):
+        super().__init__(idim, odim, dropout_rate, None)
+        self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim
+        self.receptive_field_length = 2 * (
+            3 - 1) + 3  # stride_1 * (kernel_size_2 - 1) + kerel_size_1
+
+    def forward(self, x: paddle.Tensor,
+                x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:
+        x = x.unsqueeze(1)  # (b, c=1, t, f)
+        x = self.conv(x)
+        #b, c, t, f = paddle.shape(x) #not work under jit
+        x = x.transpose([0, 2, 1, 3]).reshape([0, 0, -1])
+        x_len = ((x_len - 1) // 2 - 1) // 2
+        return x, x_len
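
To make the new subsampling concrete, here are the worked numbers implied by the class above, assuming the 161-dim linear-spectrogram input mentioned elsewhere in this commit (idim=161, odim=32):

    idim, odim = 161, 32
    output_dim = ((idim - 1) // 2 - 1) // 2 * odim   # 39 * 32 = 1248
    receptive_field_length = 2 * (3 - 1) + 3         # two kernel-3/stride-2 convs -> 7

    def subsampled_len(x_len):
        return ((x_len - 1) // 2 - 1) // 2           # roughly 4x time subsampling

    print(output_dim, receptive_field_length, subsampled_len(100))  # 1248 7 24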
paddlespeech/s2t/models/ds2/deepspeech2.py  View file @ 919c8d06
...
@@ -13,15 +13,14 @@
 # limitations under the License.
 """Deepspeech2 ASR Model"""
 import paddle
+import paddle.nn.functional as F
 from paddle import nn

-from paddlespeech.s2t.models.ds2.conv import ConvStack
+from paddlespeech.s2t.models.ds2.conv import Conv2dSubsampling4Pure
-from paddlespeech.s2t.models.ds2.rnn import RNNStack
 from paddlespeech.s2t.modules.ctc import CTCDecoder
 from paddlespeech.s2t.utils import layer_tools
 from paddlespeech.s2t.utils.checkpoint import Checkpoint
 from paddlespeech.s2t.utils.log import Log

 logger = Log(__name__).getlog()

 __all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']
...
@@ -32,72 +31,197 @@ class CRNNEncoder(nn.Layer):
                  feat_size,
                  dict_size,
                  num_conv_layers=2,
-                 num_rnn_layers=3,
+                 num_rnn_layers=4,
                  rnn_size=1024,
-                 use_gru=False,
-                 share_rnn_weights=True):
+                 rnn_direction='forward',
+                 num_fc_layers=2,
+                 fc_layers_size_list=[512, 256],
+                 use_gru=False):
         super().__init__()
         self.rnn_size = rnn_size
         self.feat_size = feat_size  # 161 for linear
         self.dict_size = dict_size
-
-        self.conv = ConvStack(feat_size, num_conv_layers)
-
-        i_size = self.conv.output_height  # H after conv stack
-        self.rnn = RNNStack(
-            i_size=i_size,
-            h_size=rnn_size,
-            num_stacks=num_rnn_layers,
-            use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights)
+        self.num_rnn_layers = num_rnn_layers
+        self.num_fc_layers = num_fc_layers
+        self.rnn_direction = rnn_direction
+        self.fc_layers_size_list = fc_layers_size_list
+        self.use_gru = use_gru
+        self.conv = Conv2dSubsampling4Pure(feat_size, 32, dropout_rate=0.0)
+
+        self.output_dim = self.conv.output_dim
+
+        i_size = self.conv.output_dim
+        self.rnn = nn.LayerList()
+        self.layernorm_list = nn.LayerList()
+        self.fc_layers_list = nn.LayerList()
+        if rnn_direction == 'bidirect' or rnn_direction == 'bidirectional':
+            layernorm_size = 2 * rnn_size
+        elif rnn_direction == 'forward':
+            layernorm_size = rnn_size
+        else:
+            raise Exception("Wrong rnn direction")
+        for i in range(0, num_rnn_layers):
+            if i == 0:
+                rnn_input_size = i_size
+            else:
+                rnn_input_size = layernorm_size
+            if use_gru is True:
+                self.rnn.append(
+                    nn.GRU(
+                        input_size=rnn_input_size,
+                        hidden_size=rnn_size,
+                        num_layers=1,
+                        direction=rnn_direction))
+            else:
+                self.rnn.append(
+                    nn.LSTM(
+                        input_size=rnn_input_size,
+                        hidden_size=rnn_size,
+                        num_layers=1,
+                        direction=rnn_direction))
+            self.layernorm_list.append(nn.LayerNorm(layernorm_size))
+            self.output_dim = layernorm_size
+
+        fc_input_size = layernorm_size
+        for i in range(self.num_fc_layers):
+            self.fc_layers_list.append(
+                nn.Linear(fc_input_size, fc_layers_size_list[i]))
+            fc_input_size = fc_layers_size_list[i]
+            self.output_dim = fc_layers_size_list[i]

     @property
     def output_size(self):
-        return self.rnn_size * 2
+        return self.output_dim

-    def forward(self, audio, audio_len):
+    def forward(self, x, x_lens, init_state_h_box=None, init_state_c_box=None):
         """Compute Encoder outputs

         Args:
-            audio (Tensor): [B, Tmax, D]
-            text (Tensor): [B, Umax]
-            audio_len (Tensor): [B]
-            text_len (Tensor): [B]
-        Returns:
+            x (Tensor): [B, T, D]
+            x_lens (Tensor): [B]
+            init_state_h_box(Tensor): init_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
+            init_state_c_box(Tensor): init_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
+        Return:
             x (Tensor): encoder outputs, [B, T, D]
             x_lens (Tensor): encoder length, [B]
+            final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
+            final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
         """
-        # [B, T, D] -> [B, D, T]
-        audio = audio.transpose([0, 2, 1])
-        # [B, D, T] -> [B, C=1, D, T]
-        x = audio.unsqueeze(1)
-        x_lens = audio_len
-
-        # convolution group
-        x, x_lens = self.conv(x, x_lens)
-
-        # convert data from convolution feature map to sequence of vectors
-        #B, C, D, T = paddle.shape(x) # not work under jit
-        x = x.transpose([0, 3, 1, 2])  #[B, T, C, D]
-        #x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit
-        x = x.reshape([0, 0, -1])  #[B, T, C*D]
-
-        # remove padding part
-        x, x_lens = self.rnn(x, x_lens)  #[B, T, D]
-        return x, x_lens
+        if init_state_h_box is not None:
+            init_state_list = None
+            if self.use_gru is True:
+                init_state_h_list = paddle.split(
+                    init_state_h_box, self.num_rnn_layers, axis=0)
+                init_state_list = init_state_h_list
+            else:
+                init_state_h_list = paddle.split(
+                    init_state_h_box, self.num_rnn_layers, axis=0)
+                init_state_c_list = paddle.split(
+                    init_state_c_box, self.num_rnn_layers, axis=0)
+                init_state_list = [(init_state_h_list[i], init_state_c_list[i])
+                                   for i in range(self.num_rnn_layers)]
+        else:
+            init_state_list = [None] * self.num_rnn_layers
+
+        x, x_lens = self.conv(x, x_lens)
+        final_chunk_state_list = []
+        for i in range(0, self.num_rnn_layers):
+            x, final_state = self.rnn[i](x, init_state_list[i],
+                                         x_lens)  #[B, T, D]
+            final_chunk_state_list.append(final_state)
+            x = self.layernorm_list[i](x)
+
+        for i in range(self.num_fc_layers):
+            x = self.fc_layers_list[i](x)
+            x = F.relu(x)
+
+        if self.use_gru is True:
+            final_chunk_state_h_box = paddle.concat(
+                final_chunk_state_list, axis=0)
+            final_chunk_state_c_box = init_state_c_box
+        else:
+            final_chunk_state_h_list = [
+                final_chunk_state_list[i][0] for i in range(self.num_rnn_layers)
+            ]
+            final_chunk_state_c_list = [
+                final_chunk_state_list[i][1] for i in range(self.num_rnn_layers)
+            ]
+            final_chunk_state_h_box = paddle.concat(
+                final_chunk_state_h_list, axis=0)
+            final_chunk_state_c_box = paddle.concat(
+                final_chunk_state_c_list, axis=0)
+
+        return x, x_lens, final_chunk_state_h_box, final_chunk_state_c_box
+
+    def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8):
+        """Compute Encoder outputs
+
+        Args:
+            x (Tensor): [B, T, D]
+            x_lens (Tensor): [B]
+            decoder_chunk_size: The chunk size of decoder
+        Returns:
+            eouts_list (List of Tensor): The list of encoder outputs in chunk_size: [B, chunk_size, D] * num_chunks
+            eouts_lens_list (List of Tensor): The list of encoder length in chunk_size: [B] * num_chunks
+            final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
+            final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
+        """
+        subsampling_rate = self.conv.subsampling_rate
+        receptive_field_length = self.conv.receptive_field_length
+        chunk_size = (decoder_chunk_size - 1
+                      ) * subsampling_rate + receptive_field_length
+        chunk_stride = subsampling_rate * decoder_chunk_size
+        max_len = x.shape[1]
+        assert (chunk_size <= max_len)
+
+        eouts_chunk_list = []
+        eouts_chunk_lens_list = []
+        if (max_len - chunk_size) % chunk_stride != 0:
+            padding_len = chunk_stride - (max_len - chunk_size) % chunk_stride
+        else:
+            padding_len = 0
+        padding = paddle.zeros((x.shape[0], padding_len, x.shape[2]))
+        padded_x = paddle.concat([x, padding], axis=1)
+        num_chunk = (max_len + padding_len - chunk_size) / chunk_stride + 1
+        num_chunk = int(num_chunk)
+        chunk_state_h_box = None
+        chunk_state_c_box = None
+        final_state_h_box = None
+        final_state_c_box = None
+        for i in range(0, num_chunk):
+            start = i * chunk_stride
+            end = start + chunk_size
+            x_chunk = padded_x[:, start:end, :]
+
+            x_len_left = paddle.where(x_lens - i * chunk_stride < 0,
+                                      paddle.zeros_like(x_lens),
+                                      x_lens - i * chunk_stride)
+            x_chunk_len_tmp = paddle.ones_like(x_lens) * chunk_size
+            x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp,
+                                        x_len_left, x_chunk_len_tmp)
+
+            eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward(
+                x_chunk, x_chunk_lens, chunk_state_h_box, chunk_state_c_box)
+
+            eouts_chunk_list.append(eouts_chunk)
+            eouts_chunk_lens_list.append(eouts_chunk_lens)
+        final_state_h_box = chunk_state_h_box
+        final_state_c_box = chunk_state_c_box
+        return eouts_chunk_list, eouts_chunk_lens_list, final_state_h_box, final_state_c_box
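
The chunk bookkeeping in forward_chunk_by_chunk is easier to follow with numbers, assuming the Conv2dSubsampling4Pure values from earlier in this commit (subsampling_rate=4, receptive_field_length=7):

    subsampling_rate, receptive_field_length = 4, 7
    decoder_chunk_size = 8
    chunk_size = (decoder_chunk_size - 1) * subsampling_rate + receptive_field_length  # 35
    chunk_stride = subsampling_rate * decoder_chunk_size                               # 32
    max_len = 100  # illustrative input length
    rem = (max_len - chunk_size) % chunk_stride
    padding_len = chunk_stride - rem if rem != 0 else 0                                # 31
    num_chunk = (max_len + padding_len - chunk_size) // chunk_stride + 1               # 4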
 class DeepSpeech2Model(nn.Layer):
     """The DeepSpeech2 network structure.

-    :param audio_data: Audio spectrogram data layer.
-    :type audio_data: Variable
+    :param audio: Audio spectrogram data layer.
+    :type audio: Variable
-    :param text_data: Transcription text data layer.
-    :type text_data: Variable
+    :param text: Transcription text data layer.
+    :type text: Variable
     :param audio_len: Valid sequence length data layer.
     :type audio_len: Variable
-    :param masks: Masks data layer to reset padding.
-    :type masks: Variable
+    :param feat_size: feature size for audio.
+    :type feat_size: int
     :param dict_size: Dictionary size for tokenized transcription.
     :type dict_size: int
     :param num_conv_layers: Number of stacking convolution layers.
...
@@ -106,37 +230,41 @@ class DeepSpeech2Model(nn.Layer):
     :type num_rnn_layers: int
     :param rnn_size: RNN layer size (dimension of RNN cells).
     :type rnn_size: int
+    :param num_fc_layers: Number of stacking FC layers.
+    :type num_fc_layers: int
+    :param fc_layers_size_list: The list of FC layer sizes.
+    :type fc_layers_size_list: [int,]
     :param use_gru: Use gru if set True. Use simple rnn if set False.
     :type use_gru: bool
-    :param share_rnn_weights: Whether to share input-hidden weights between
-                              forward and backward direction RNNs.
-                              It is only available when use_gru=False.
-    :type share_weights: bool
     :return: A tuple of an output unnormalized log probability layer (
              before softmax) and a ctc cost layer.
     :rtype: tuple of LayerOutput
     """

     def __init__(self,
                  feat_size,
                  dict_size,
                  num_conv_layers=2,
-                 num_rnn_layers=3,
+                 num_rnn_layers=4,
                  rnn_size=1024,
-                 use_gru=False,
-                 share_rnn_weights=True,
+                 rnn_direction='forward',
+                 num_fc_layers=2,
+                 fc_layers_size_list=[512, 256],
+                 use_gru=False,
                  blank_id=0,
-                 ctc_grad_norm_type=None):
+                 ctc_grad_norm_type=None, ):
         super().__init__()
         self.encoder = CRNNEncoder(
             feat_size=feat_size,
             dict_size=dict_size,
             num_conv_layers=num_conv_layers,
             num_rnn_layers=num_rnn_layers,
+            rnn_direction=rnn_direction,
+            num_fc_layers=num_fc_layers,
+            fc_layers_size_list=fc_layers_size_list,
             rnn_size=rnn_size,
-            use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights)
-        assert (self.encoder.output_size == rnn_size * 2)
+            use_gru=use_gru)

         self.decoder = CTCDecoder(
             odim=dict_size,  # <blank> is in vocab
@@ -151,7 +279,7 @@ class DeepSpeech2Model(nn.Layer):
         """Compute Model loss

         Args:
-            audio (Tensors): [B, T, D]
+            audio (Tensor): [B, T, D]
             audio_len (Tensor): [B]
             text (Tensor): [B, U]
             text_len (Tensor): [B]
...
@@ -159,22 +287,22 @@ class DeepSpeech2Model(nn.Layer):
         Returns:
             loss (Tensor): [1]
         """
-        eouts, eouts_len = self.encoder(audio, audio_len)
+        eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
+            audio, audio_len, None, None)
         loss = self.decoder(eouts, eouts_len, text, text_len)
         return loss

     @paddle.no_grad()
     def decode(self, audio, audio_len):
         # decoders only accept string encoded in utf-8
         # Make sure the decoder has been initialized
-        eouts, eouts_len = self.encoder(audio, audio_len)
+        eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
+            audio, audio_len, None, None)
         probs = self.decoder.softmax(eouts)
         batch_size = probs.shape[0]
         self.decoder.reset_decoder(batch_size=batch_size)
         self.decoder.next(probs, eouts_len)
         trans_best, trans_beam = self.decoder.decode()
         return trans_best

     @classmethod
...
...
@@ -196,13 +324,15 @@ class DeepSpeech2Model(nn.Layer):
             The model built from pretrained result.
         """
         model = cls(
-            feat_size=dataloader.collate_fn.feature_size,
-            dict_size=dataloader.collate_fn.vocab_size,
+            feat_size=dataloader.feat_dim,
+            dict_size=dataloader.vocab_size,
             num_conv_layers=config.num_conv_layers,
             num_rnn_layers=config.num_rnn_layers,
             rnn_size=config.rnn_layer_size,
+            rnn_direction=config.rnn_direction,
+            num_fc_layers=config.num_fc_layers,
+            fc_layers_size_list=config.fc_layers_size_list,
             use_gru=config.use_gru,
-            share_rnn_weights=config.share_rnn_weights,
             blank_id=config.blank_id,
             ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
         infos = Checkpoint().load_parameters(
...
@@ -229,8 +359,10 @@ class DeepSpeech2Model(nn.Layer):
...
@@ -229,8 +359,10 @@ class DeepSpeech2Model(nn.Layer):
num_conv_layers
=
config
.
num_conv_layers
,
num_conv_layers
=
config
.
num_conv_layers
,
num_rnn_layers
=
config
.
num_rnn_layers
,
num_rnn_layers
=
config
.
num_rnn_layers
,
rnn_size
=
config
.
rnn_layer_size
,
rnn_size
=
config
.
rnn_layer_size
,
rnn_direction
=
config
.
rnn_direction
,
num_fc_layers
=
config
.
num_fc_layers
,
fc_layers_size_list
=
config
.
fc_layers_size_list
,
use_gru
=
config
.
use_gru
,
use_gru
=
config
.
use_gru
,
share_rnn_weights
=
config
.
share_rnn_weights
,
blank_id
=
config
.
blank_id
,
blank_id
=
config
.
blank_id
,
ctc_grad_norm_type
=
config
.
get
(
'ctc_grad_norm_type'
,
None
),
)
ctc_grad_norm_type
=
config
.
get
(
'ctc_grad_norm_type'
,
None
),
)
return
model
return
model
...
@@ -240,28 +372,46 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-    def forward(self, audio, audio_len):
-        """export model function
-
-        Args:
-            audio (Tensor): [B, T, D]
-            audio_len (Tensor): [B]
-
-        Returns:
-            probs: probs after softmax
-        """
-        eouts, eouts_len = self.encoder(audio, audio_len)
-        probs = self.decoder.softmax(eouts)
-        return probs, eouts_len
+    def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box=None,
+                chunk_state_c_box=None):
+        if self.encoder.rnn_direction == "forward":
+            eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder(
+                audio_chunk, audio_chunk_lens, chunk_state_h_box,
+                chunk_state_c_box)
+            probs_chunk = self.decoder.softmax(eouts_chunk)
+            return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box
+        elif self.encoder.rnn_direction == "bidirect":
+            eouts, eouts_len, _, _ = self.encoder(audio_chunk,
+                                                  audio_chunk_lens)
+            probs = self.decoder.softmax(eouts)
+            return probs, eouts_len
+        else:
+            raise Exception("wrong model type")

     def export(self):
-        static_model = paddle.jit.to_static(
-            self,
-            input_spec=[
-                paddle.static.InputSpec(
-                    shape=[None, None, self.encoder.feat_size],
-                    dtype='float32'),  # audio, [B,T,D]
-                paddle.static.InputSpec(shape=[None],
-                                        dtype='int64'),  # audio_length, [B]
-            ])
+        if self.encoder.rnn_direction == "forward":
+            static_model = paddle.jit.to_static(
+                self,
+                input_spec=[
+                    paddle.static.InputSpec(
+                        shape=[None, None,
+                               self.encoder.feat_size],  #[B, chunk_size, feat_dim]
+                        dtype='float32'),
+                    paddle.static.InputSpec(shape=[None],
+                                            dtype='int64'),  # audio_length, [B]
+                    paddle.static.InputSpec(
+                        shape=[None, None, None], dtype='float32'),
+                    paddle.static.InputSpec(
+                        shape=[None, None, None], dtype='float32')
+                ])
+        elif self.encoder.rnn_direction == "bidirect":
+            static_model = paddle.jit.to_static(
+                self,
+                input_spec=[
+                    paddle.static.InputSpec(
+                        shape=[None, None, self.encoder.feat_size],
+                        dtype='float32'),  # audio, [B,T,D]
+                    paddle.static.InputSpec(shape=[None],
+                                            dtype='int64'),  # audio_length, [B]
+                ])
+        else:
+            raise Exception("wrong model type")
         return static_model
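
For the streaming (rnn_direction="forward") export above, the two extra InputSpec entries are the recurrent h/c state boxes that the caller threads from chunk to chunk. A hedged sketch of that feedback loop, with a stub standing in for the exported program:

    import paddle

    num_rnn_layers, rnn_size = 5, 1024  # illustrative values, not from this diff

    def model(chunk, chunk_lens, h_box, c_box):
        # stub with the exported signature: probs, lens, final_h, final_c
        return paddle.zeros([1, 8, 29]), chunk_lens, h_box, c_box

    h_box = paddle.zeros([num_rnn_layers, 1, rnn_size])
    c_box = paddle.zeros([num_rnn_layers, 1, rnn_size])
    for _ in range(3):  # one iteration per incoming audio chunk
        chunk = paddle.zeros([1, 35, 161])  # [B, chunk_size, feat_dim]
        probs, lens, h_box, c_box = model(chunk, paddle.to_tensor([35]), h_box,
                                          c_box)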
paddlespeech/s2t/models/ds2/rnn.py  deleted 100644 → 0  View file @ 8b1c1ec4
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

from paddlespeech.s2t.modules.activation import brelu
from paddlespeech.s2t.modules.mask import make_non_pad_mask
from paddlespeech.s2t.utils.log import Log

logger = Log(__name__).getlog()

__all__ = ['RNNStack']


class RNNCell(nn.RNNCellBase):
    r"""
    Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
    computes the outputs and updates states.
    The formula used is as follows:
    .. math::
        h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
        y_{t} & = h_{t}
    where :math:`act` is for :attr:`activation`.
    """

    def __init__(self,
                 hidden_size: int,
                 activation="tanh",
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super().__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_hh = self.create_parameter(
            (hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = None
        self.bias_hh = self.create_parameter(
            (hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))

        self.hidden_size = hidden_size
        if activation not in ["tanh", "relu", "brelu"]:
            raise ValueError(
                "activation for SimpleRNNCell should be tanh or relu, "
                "but get {}".format(activation))
        self.activation = activation
        self._activation_fn = paddle.tanh \
            if activation == "tanh" \
            else F.relu
        if activation == 'brelu':
            self._activation_fn = brelu

    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)
        pre_h = states
        i2h = inputs
        if self.bias_ih is not None:
            i2h += self.bias_ih
        h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            h2h += self.bias_hh
        h = self._activation_fn(i2h + h2h)
        return h, h

    @property
    def state_shape(self):
        return (self.hidden_size, )


class GRUCell(nn.RNNCellBase):
    r"""
    Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
    it computes the outputs and updates states.
    The formula for GRU used is as follows:
    .. math::
        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
        h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
        y_{t} & = h_{t}
    where :math:`\sigma` is the sigmoid function, and * is the elementwise
    multiplication operator.
    """

    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super().__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_hh = self.create_parameter(
            (3 * hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = None
        self.bias_hh = self.create_parameter(
            (3 * hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))

        self.hidden_size = hidden_size
        self.input_size = input_size
        self._gate_activation = F.sigmoid
        self._activation = paddle.tanh

    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)

        pre_hidden = states
        x_gates = inputs
        if self.bias_ih is not None:
            x_gates = x_gates + self.bias_ih
        h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            h_gates = h_gates + self.bias_hh

        x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1)
        h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1)

        r = self._gate_activation(x_r + h_r)
        z = self._gate_activation(x_z + h_z)
        c = self._activation(x_c + r * h_c)  # apply reset gate after mm
        h = (pre_hidden - c) * z + c
        # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru

        return h, h

    @property
    def state_shape(self):
        r"""
        The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
        size would be automatically inserted into shape). The shape corresponds
        to the shape of :math:`h_{t-1}`.
        """
        return (self.hidden_size, )


class BiRNNWithBN(nn.Layer):
    """Bidirectional simple rnn layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param size: Dimension of RNN cells.
    :type size: int
    :param share_weights: Whether to share input-hidden weights between
                          forward and backward directional RNNs.
    :type share_weights: bool
    :return: Bidirectional simple rnn layer.
    :rtype: Variable
    """

    def __init__(self, i_size: int, h_size: int, share_weights: bool):
        super().__init__()
        self.share_weights = share_weights
        if self.share_weights:
            # input-hidden weights shared between bi-directional rnn.
            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            # batch norm is only performed on input-state projection
            self.fw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')
            self.bw_fc = self.fw_fc
            self.bw_bn = self.fw_bn
        else:
            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            self.fw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')
            self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            self.bw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')

        self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')
        self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')
        self.fw_rnn = nn.RNN(
            self.fw_cell, is_reverse=False, time_major=False)  # [B, T, D]
        # NOTE: bw_rnn reuses fw_cell here; bw_cell is constructed but unused.
        self.bw_rnn = nn.RNN(
            self.fw_cell, is_reverse=True, time_major=False)  # [B, T, D]

    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
        # x, shape [B, T, D]
        fw_x = self.fw_bn(self.fw_fc(x))
        bw_x = self.bw_bn(self.bw_fc(x))
        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
        x = paddle.concat([fw_x, bw_x], axis=-1)
        return x, x_len


class BiGRUWithBN(nn.Layer):
    """Bidirectional gru layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer.
    :type name: string
    :param input: Input layer.
    :type input: Variable
    :param size: Dimension of GRU cells.
    :type size: int
    :param act: Activation type.
    :type act: string
    :return: Bidirectional GRU layer.
    :rtype: Variable
    """

    def __init__(self, i_size: int, h_size: int):
        super().__init__()
        hidden_size = h_size * 3

        self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
        self.fw_bn = nn.BatchNorm1D(
            hidden_size, bias_attr=None, data_format='NLC')
        self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
        self.bw_bn = nn.BatchNorm1D(
            hidden_size, bias_attr=None, data_format='NLC')

        self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
        self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
        self.fw_rnn = nn.RNN(
            self.fw_cell, is_reverse=False, time_major=False)  # [B, T, D]
        # NOTE: bw_rnn reuses fw_cell here; bw_cell is constructed but unused.
        self.bw_rnn = nn.RNN(
            self.fw_cell, is_reverse=True, time_major=False)  # [B, T, D]

    def forward(self, x, x_len):
        # x, shape [B, T, D]
        fw_x = self.fw_bn(self.fw_fc(x))
        bw_x = self.bw_bn(self.bw_fc(x))
        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
        x = paddle.concat([fw_x, bw_x], axis=-1)
        return x, x_len


class RNNStack(nn.Layer):
    """RNN group with stacked bidirectional simple RNN or GRU layers.

    :param input: Input layer.
    :type input: Variable
    :param size: Dimension of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward directional RNNs.
                              It is only available when use_gru=False.
    :type share_weights: bool
    :return: Output layer of the RNN group.
    :rtype: Variable
    """

    def __init__(self,
                 i_size: int,
                 h_size: int,
                 num_stacks: int,
                 use_gru: bool,
                 share_rnn_weights: bool):
        super().__init__()
        rnn_stacks = []
        for i in range(num_stacks):
            if use_gru:
                # default: GRU using tanh
                rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size))
            else:
                rnn_stacks.append(
                    BiRNNWithBN(
                        i_size=i_size,
                        h_size=h_size,
                        share_weights=share_rnn_weights))
            i_size = h_size * 2

        self.rnn_stacks = nn.LayerList(rnn_stacks)

    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
        """
        x: shape [B, T, D]
        x_len: shape [B]
        """
        for i, rnn in enumerate(self.rnn_stacks):
            x, x_len = rnn(x, x_len)
            masks = make_non_pad_mask(x_len)  # [B, T]
            masks = masks.unsqueeze(-1)  # [B, T, 1]
            # TODO(Hui Zhang): not support bool multiply
            masks = masks.astype(x.dtype)
            x = x.multiply(masks)
        return x, x_len
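A minimal driver for the RNNStack defined above, useful when reading this deleted file in isolation. The sizes and random inputs are illustrative assumptions; the constructor signature and the h_size * 2 output width per bidirectional layer come straight from the code.

    import paddle
    from paddlespeech.s2t.models.ds2.rnn import RNNStack  # pre-removal path

    stack = RNNStack(i_size=161, h_size=256, num_stacks=3,
                     use_gru=False, share_rnn_weights=True)
    x = paddle.randn([2, 50, 161])                     # [B, T, D] features
    x_len = paddle.to_tensor([50, 30], dtype='int64')  # valid frames per utt
    y, y_len = stack(x, x_len)
    print(y.shape)  # [2, 50, 512]; frames past x_len are zeroed by the mask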
paddlespeech/s2t/models/ds2_online/__init__.py
deleted 100644 → 0
View file @ 8b1c1ec4
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .deepspeech2 import DeepSpeech2InferModelOnline
from .deepspeech2 import DeepSpeech2ModelOnline
from paddlespeech.s2t.utils import dynamic_pip_install
import sys

try:
    import paddlespeech_ctcdecoders
except ImportError:
    try:
        package_name = 'paddlespeech_ctcdecoders'
        if sys.platform != "win32":
            dynamic_pip_install.install(package_name)
    except Exception:
        raise RuntimeError(
            "Can not install package paddlespeech_ctcdecoders on your system. \
                The DeepSpeech2 model is not supported for your system")

__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']
paddlespeech/s2t/models/ds2_online/conv.py
deleted 100644 → 0
View file @ 8b1c1ec4
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle

from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4


class Conv2dSubsampling4Online(Conv2dSubsampling4):
    def __init__(self, idim: int, odim: int, dropout_rate: float):
        super().__init__(idim, odim, dropout_rate, None)
        self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim
        self.receptive_field_length = 2 * (
            3 - 1) + 3  # stride_1 * (kernel_size_2 - 1) + kernel_size_1

    def forward(self, x: paddle.Tensor,
                x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:
        x = x.unsqueeze(1)  # (b, c=1, t, f)
        x = self.conv(x)
        # b, c, t, f = paddle.shape(x)  # not work under jit
        x = x.transpose([0, 2, 1, 3]).reshape([0, 0, -1])
        x_len = ((x_len - 1) // 2 - 1) // 2
        return x, x_len
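The length formula above is easy to sanity-check by hand. A small sketch of the same arithmetic; the constants follow from the two stride-2, kernel-3 convolutions of Conv2dSubsampling4, which this class inherits:

    # stride_1 * (kernel_size_2 - 1) + kernel_size_1 = 2 * (3 - 1) + 3 = 7
    def subsampled_len(x_len: int) -> int:
        """Output frames after two stride-2 convolutions, as in forward()."""
        return ((x_len - 1) // 2 - 1) // 2

    for t in (7, 16, 35, 67):
        print(t, '->', subsampled_len(t))  # 7->1, 16->3, 35->8, 67->16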
paddlespeech/s2t/models/ds2_online/deepspeech2.py
deleted 100644 → 0
View file @ 8b1c1ec4
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Deepspeech2 ASR Online Model"""
import paddle
import paddle.nn.functional as F
from paddle import nn

from paddlespeech.s2t.models.ds2_online.conv import Conv2dSubsampling4Online
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.utils import layer_tools
from paddlespeech.s2t.utils.checkpoint import Checkpoint
from paddlespeech.s2t.utils.log import Log

logger = Log(__name__).getlog()

__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']


class CRNNEncoder(nn.Layer):
    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=4,
                 rnn_size=1024,
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
                 use_gru=False):
        super().__init__()
        self.rnn_size = rnn_size
        self.feat_size = feat_size  # 161 for linear
        self.dict_size = dict_size
        self.num_rnn_layers = num_rnn_layers
        self.num_fc_layers = num_fc_layers
        self.rnn_direction = rnn_direction
        self.fc_layers_size_list = fc_layers_size_list
        self.use_gru = use_gru
        self.conv = Conv2dSubsampling4Online(feat_size, 32, dropout_rate=0.0)

        self.output_dim = self.conv.output_dim

        i_size = self.conv.output_dim
        self.rnn = nn.LayerList()
        self.layernorm_list = nn.LayerList()
        self.fc_layers_list = nn.LayerList()
        if rnn_direction == 'bidirect' or rnn_direction == 'bidirectional':
            layernorm_size = 2 * rnn_size
        elif rnn_direction == 'forward':
            layernorm_size = rnn_size
        else:
            raise Exception("Wrong rnn direction")
        for i in range(0, num_rnn_layers):
            if i == 0:
                rnn_input_size = i_size
            else:
                rnn_input_size = layernorm_size
            if use_gru is True:
                self.rnn.append(
                    nn.GRU(
                        input_size=rnn_input_size,
                        hidden_size=rnn_size,
                        num_layers=1,
                        direction=rnn_direction))
            else:
                self.rnn.append(
                    nn.LSTM(
                        input_size=rnn_input_size,
                        hidden_size=rnn_size,
                        num_layers=1,
                        direction=rnn_direction))
            self.layernorm_list.append(nn.LayerNorm(layernorm_size))
            self.output_dim = layernorm_size

        fc_input_size = layernorm_size
        for i in range(self.num_fc_layers):
            self.fc_layers_list.append(
                nn.Linear(fc_input_size, fc_layers_size_list[i]))
            fc_input_size = fc_layers_size_list[i]
            self.output_dim = fc_layers_size_list[i]

    @property
    def output_size(self):
        return self.output_dim

    def forward(self, x, x_lens, init_state_h_box=None, init_state_c_box=None):
        """Compute Encoder outputs

        Args:
            x (Tensor): [B, T, D]
            x_lens (Tensor): [B]
            init_state_h_box (Tensor): init_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            init_state_c_box (Tensor): init_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
        Return:
            x (Tensor): encoder outputs, [B, T, D]
            x_lens (Tensor): encoder length, [B]
            final_state_h_box (Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            final_state_c_box (Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
        """
        if init_state_h_box is not None:
            init_state_list = None

            if self.use_gru is True:
                init_state_h_list = paddle.split(
                    init_state_h_box, self.num_rnn_layers, axis=0)
                init_state_list = init_state_h_list
            else:
                init_state_h_list = paddle.split(
                    init_state_h_box, self.num_rnn_layers, axis=0)
                init_state_c_list = paddle.split(
                    init_state_c_box, self.num_rnn_layers, axis=0)
                init_state_list = [(init_state_h_list[i], init_state_c_list[i])
                                   for i in range(self.num_rnn_layers)]
        else:
            init_state_list = [None] * self.num_rnn_layers

        x, x_lens = self.conv(x, x_lens)
        final_chunk_state_list = []
        for i in range(0, self.num_rnn_layers):
            x, final_state = self.rnn[i](x, init_state_list[i],
                                         x_lens)  # [B, T, D]
            final_chunk_state_list.append(final_state)
            x = self.layernorm_list[i](x)

        for i in range(self.num_fc_layers):
            x = self.fc_layers_list[i](x)
            x = F.relu(x)

        if self.use_gru is True:
            final_chunk_state_h_box = paddle.concat(
                final_chunk_state_list, axis=0)
            final_chunk_state_c_box = init_state_c_box
        else:
            final_chunk_state_h_list = [
                final_chunk_state_list[i][0] for i in range(self.num_rnn_layers)
            ]
            final_chunk_state_c_list = [
                final_chunk_state_list[i][1] for i in range(self.num_rnn_layers)
            ]
            final_chunk_state_h_box = paddle.concat(
                final_chunk_state_h_list, axis=0)
            final_chunk_state_c_box = paddle.concat(
                final_chunk_state_c_list, axis=0)

        return x, x_lens, final_chunk_state_h_box, final_chunk_state_c_box

    def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8):
        """Compute Encoder outputs

        Args:
            x (Tensor): [B, T, D]
            x_lens (Tensor): [B]
            decoder_chunk_size: The chunk size of decoder
        Returns:
            eouts_list (List of Tensor): The list of encoder outputs in chunk_size: [B, chunk_size, D] * num_chunks
            eouts_lens_list (List of Tensor): The list of encoder length in chunk_size: [B] * num_chunks
            final_state_h_box (Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            final_state_c_box (Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
        """
        subsampling_rate = self.conv.subsampling_rate
        receptive_field_length = self.conv.receptive_field_length
        chunk_size = (decoder_chunk_size - 1
                      ) * subsampling_rate + receptive_field_length
        chunk_stride = subsampling_rate * decoder_chunk_size
        max_len = x.shape[1]
        assert (chunk_size <= max_len)

        eouts_chunk_list = []
        eouts_chunk_lens_list = []
        if (max_len - chunk_size) % chunk_stride != 0:
            padding_len = chunk_stride - (max_len - chunk_size) % chunk_stride
        else:
            padding_len = 0
        padding = paddle.zeros((x.shape[0], padding_len, x.shape[2]))
        padded_x = paddle.concat([x, padding], axis=1)
        num_chunk = (max_len + padding_len - chunk_size) / chunk_stride + 1
        num_chunk = int(num_chunk)
        chunk_state_h_box = None
        chunk_state_c_box = None
        final_state_h_box = None
        final_state_c_box = None
        for i in range(0, num_chunk):
            start = i * chunk_stride
            end = start + chunk_size
            x_chunk = padded_x[:, start:end, :]

            x_len_left = paddle.where(x_lens - i * chunk_stride < 0,
                                      paddle.zeros_like(x_lens),
                                      x_lens - i * chunk_stride)
            x_chunk_len_tmp = paddle.ones_like(x_lens) * chunk_size
            x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp,
                                        x_len_left, x_chunk_len_tmp)

            eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward(
                x_chunk, x_chunk_lens, chunk_state_h_box, chunk_state_c_box)

            eouts_chunk_list.append(eouts_chunk)
            eouts_chunk_lens_list.append(eouts_chunk_lens)
        final_state_h_box = chunk_state_h_box
        final_state_c_box = chunk_state_c_box
        return eouts_chunk_list, eouts_chunk_lens_list, final_state_h_box, final_state_c_box


class DeepSpeech2ModelOnline(nn.Layer):
    """The DeepSpeech2 network structure for online.

    :param audio: Audio spectrogram data layer.
    :type audio: Variable
    :param text: Transcription text data layer.
    :type text: Variable
    :param audio_len: Valid sequence length data layer.
    :type audio_len: Variable
    :param feat_size: feature size for audio.
    :type feat_size: int
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (dimension of RNN cells).
    :type rnn_size: int
    :param num_fc_layers: Number of stacking FC layers.
    :type num_fc_layers: int
    :param fc_layers_size_list: The list of FC layer sizes.
    :type fc_layers_size_list: [int,]
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :return: A tuple of an output unnormalized log probability layer (
             before softmax) and a ctc cost layer.
    :rtype: tuple of LayerOutput
    """

    def __init__(
            self,
            feat_size,
            dict_size,
            num_conv_layers=2,
            num_rnn_layers=4,
            rnn_size=1024,
            rnn_direction='forward',
            num_fc_layers=2,
            fc_layers_size_list=[512, 256],
            use_gru=False,
            blank_id=0,
            ctc_grad_norm_type=None, ):
        super().__init__()
        self.encoder = CRNNEncoder(
            feat_size=feat_size,
            dict_size=dict_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_direction=rnn_direction,
            num_fc_layers=num_fc_layers,
            fc_layers_size_list=fc_layers_size_list,
            rnn_size=rnn_size,
            use_gru=use_gru)

        self.decoder = CTCDecoder(
            odim=dict_size,  # <blank> is in vocab
            enc_n_units=self.encoder.output_size,
            blank_id=blank_id,
            dropout_rate=0.0,
            reduction=True,  # sum
            batch_average=True,  # sum / batch_size
            grad_norm_type=ctc_grad_norm_type)

    def forward(self, audio, audio_len, text, text_len):
        """Compute Model loss

        Args:
            audio (Tensor): [B, T, D]
            audio_len (Tensor): [B]
            text (Tensor): [B, U]
            text_len (Tensor): [B]
        Returns:
            loss (Tensor): [1]
        """
        eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
            audio, audio_len, None, None)
        loss = self.decoder(eouts, eouts_len, text, text_len)
        return loss

    @paddle.no_grad()
    def decode(self, audio, audio_len):
        # decoders only accept string encoded in utf-8
        # Make sure the decoder has been initialized
        eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
            audio, audio_len, None, None)
        probs = self.decoder.softmax(eouts)
        batch_size = probs.shape[0]
        self.decoder.reset_decoder(batch_size=batch_size)
        self.decoder.next(probs, eouts_len)
        trans_best, trans_beam = self.decoder.decode()
        return trans_best

    @classmethod
    def from_pretrained(cls, dataloader, config, checkpoint_path):
        """Build a DeepSpeech2Model model from a pretrained model.

        Parameters
        ----------
        dataloader: paddle.io.DataLoader
        config: yacs.config.CfgNode
            model configs
        checkpoint_path: Path or str
            the path of pretrained model checkpoint, without extension name
        Returns
        -------
        DeepSpeech2ModelOnline
            The model built from pretrained result.
        """
        model = cls(
            feat_size=dataloader.collate_fn.feature_size,
            dict_size=dataloader.collate_fn.vocab_size,
            num_conv_layers=config.num_conv_layers,
            num_rnn_layers=config.num_rnn_layers,
            rnn_size=config.rnn_layer_size,
            rnn_direction=config.rnn_direction,
            num_fc_layers=config.num_fc_layers,
            fc_layers_size_list=config.fc_layers_size_list,
            use_gru=config.use_gru,
            blank_id=config.blank_id,
            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
        infos = Checkpoint().load_parameters(
            model, checkpoint_path=checkpoint_path)
        logger.info(f"checkpoint info: {infos}")
        layer_tools.summary(model)
        return model

    @classmethod
    def from_config(cls, config):
        """Build a DeepSpeech2ModelOnline from config

        Parameters
        ----------
        config: yacs.config.CfgNode
            config
        Returns
        -------
        DeepSpeech2ModelOnline
            The model built from config.
        """
        model = cls(
            feat_size=config.input_dim,
            dict_size=config.output_dim,
            num_conv_layers=config.num_conv_layers,
            num_rnn_layers=config.num_rnn_layers,
            rnn_size=config.rnn_layer_size,
            rnn_direction=config.rnn_direction,
            num_fc_layers=config.num_fc_layers,
            fc_layers_size_list=config.fc_layers_size_list,
            use_gru=config.use_gru,
            blank_id=config.blank_id,
            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
        return model


class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
                chunk_state_c_box):
        eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder(
            audio_chunk, audio_chunk_lens, chunk_state_h_box,
            chunk_state_c_box)
        probs_chunk = self.decoder.softmax(eouts_chunk)
        return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box

    def export(self):
        static_model = paddle.jit.to_static(
            self,
            input_spec=[
                paddle.static.InputSpec(
                    shape=[None, None,
                           self.encoder.feat_size],  # [B, chunk_size, feat_dim]
                    dtype='float32'),
                paddle.static.InputSpec(shape=[None],
                                        dtype='int64'),  # audio_length, [B]
                paddle.static.InputSpec(
                    shape=[None, None, None], dtype='float32'),
                paddle.static.InputSpec(
                    shape=[None, None, None], dtype='float32')
            ])
        return static_model
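The chunk slicing in forward_chunk_by_chunk reduces to two numbers. A worked sketch with the values implied by Conv2dSubsampling4Online (subsampling_rate=4, receptive_field_length=7) and the default decoder_chunk_size=8; only these constants are assumptions here, the formulas are the ones above:

    subsampling_rate = 4
    receptive_field_length = 7
    decoder_chunk_size = 8

    chunk_size = (decoder_chunk_size - 1) * subsampling_rate \
        + receptive_field_length                      # 35 input frames per chunk
    chunk_stride = subsampling_rate * decoder_chunk_size  # hop of 32 frames
    # each 35-frame window subsamples to exactly decoder_chunk_size frames:
    assert ((chunk_size - 1) // 2 - 1) // 2 == decoder_chunk_size
    print(chunk_size, chunk_stride)  # 35 32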
paddlespeech/server/engine/asr/online/asr_engine.py
View file @ 919c8d06
...
@@ -25,7 +25,6 @@ from paddlespeech.cli.log import logger
 from paddlespeech.cli.utils import MODEL_HOME
 from paddlespeech.resource import CommonTaskResource
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.frontend.speech import SpeechSegment
 from paddlespeech.s2t.modules.ctc import CTCDecoder
 from paddlespeech.s2t.transform.transformation import Transformation
 from paddlespeech.s2t.utils.tensor_utils import add_sos_eos
...
@@ -66,10 +65,13 @@ class PaddleASRConnectionHanddler:
         self.text_feature = self.asr_engine.executor.text_feature

         if "deepspeech2" in self.model_type:
-            from paddlespeech.s2t.io.collator import SpeechCollator
             self.am_predictor = self.asr_engine.executor.am_predictor
-            self.collate_fn_test = SpeechCollator.from_config(self.model_config)
+
+            # extract feat, new only fbank in conformer model
+            self.preprocess_conf = self.model_config.preprocess_config
+            self.preprocess_args = {"train": False}
+            self.preprocessing = Transformation(self.preprocess_conf)

             self.decoder = CTCDecoder(
                 odim=self.model_config.output_dim,  # <blank> is in vocab
                 enc_n_units=self.model_config.rnn_layer_size * 2,
@@ -89,10 +91,8 @@ class PaddleASRConnectionHanddler:
                 cfg.num_proc_bsearch)

             # frame window and frame shift, in samples unit
-            self.win_length = int(self.model_config.window_ms / 1000 *
-                                  self.sample_rate)
-            self.n_shift = int(self.model_config.stride_ms / 1000 *
-                               self.sample_rate)
+            self.win_length = self.preprocess_conf.process[0]['win_length']
+            self.n_shift = self.preprocess_conf.process[0]['n_shift']

         elif "conformer" in self.model_type or "transformer" in self.model_type:
             # acoustic model
@@ -114,20 +114,15 @@ class PaddleASRConnectionHanddler:
             raise ValueError(f"Not supported: {self.model_type}")

     def extract_feat(self, samples):
         # we compute the elapsed time of first char occuring
         # and we record the start time at the first pcm sample arraving
         if "deepspeech2online" in self.model_type:
             # self.reamined_wav stores all the samples,
             # include the original remained_wav and this package samples
             samples = np.frombuffer(samples, dtype=np.int16)
             assert samples.ndim == 1

-            # pcm16 -> pcm 32
-            # pcm2float will change the orignal samples,
-            # so we shoule do pcm2float before concatenate
-            samples = pcm2float(samples)
             if self.remained_wav is None:
                 self.remained_wav = samples
             else:
@@ -137,26 +132,11 @@ class PaddleASRConnectionHanddler:
                     f"The connection remain the audio samples: {self.remained_wav.shape}"
                 )

-            # read audio
-            speech_segment = SpeechSegment.from_pcm(
-                self.remained_wav, self.sample_rate, transcript=" ")
-            # audio augment
-            self.collate_fn_test.augmentation.transform_audio(speech_segment)
-
-            # extract speech feature
-            spectrum, transcript_part = self.collate_fn_test._speech_featurizer.featurize(
-                speech_segment, self.collate_fn_test.keep_transcription_text)
-            # CMVN spectrum
-            if self.collate_fn_test._normalizer:
-                spectrum = self.collate_fn_test._normalizer.apply(spectrum)
-
-            # spectrum augment
-            feat = self.collate_fn_test.augmentation.transform_feature(spectrum)
-
-            # audio_len is frame num
-            frame_num = feat.shape[0]
-            feat = paddle.to_tensor(feat, dtype='float32')
-            feat = paddle.unsqueeze(feat, axis=0)
+            # fbank
+            feat = self.preprocessing(self.remained_wav,
+                                      **self.preprocess_args)
+            feat = paddle.to_tensor(
+                feat, dtype="float32").unsqueeze(axis=0)

             if self.cached_feat is None:
                 self.cached_feat = feat
@@ -170,8 +150,11 @@ class PaddleASRConnectionHanddler:
             if self.device is None:
                 self.device = self.cached_feat.place

-            self.num_frames += frame_num
-            self.remained_wav = self.remained_wav[self.n_shift * frame_num:]
+            # cur frame step
+            num_frames = feat.shape[1]
+            self.num_frames += num_frames
+            self.remained_wav = self.remained_wav[self.n_shift * num_frames:]

             logger.info(
                 f"process the audio feature success, the connection feat shape: {self.cached_feat.shape}"
@@ -190,7 +173,7 @@ class PaddleASRConnectionHanddler:
                 f"This package receive {samples.shape[0]} pcm data. Global samples: {self.num_samples}"
             )

             # self.reamined_wav stores all the samples,
             # include the original remained_wav and this package samples
             if self.remained_wav is None:
                 self.remained_wav = samples
@@ -246,7 +229,7 @@ class PaddleASRConnectionHanddler:
     def reset(self):
         if "deepspeech2" in self.model_type:
             # for deepspeech2
             # init state
             self.chunk_state_h_box = np.zeros(
                 (self.model_config.num_rnn_layers, 1,
@@ -275,7 +258,7 @@ class PaddleASRConnectionHanddler:
         ## conformer
         # cache for conformer online
         self.subsampling_cache = None
         self.elayers_output_cache = None
         self.conformer_cnn_cache = None
@@ -359,7 +342,7 @@ class PaddleASRConnectionHanddler:
             # update feat cache
             self.cached_feat = self.cached_feat[:, end - cached_feature_num:, :]

             # return trans_best[0]
         elif "conformer" in self.model_type or "transformer" in self.model_type:
             try:
                 logger.info(
@@ -565,7 +548,7 @@ class PaddleASRConnectionHanddler:
     @paddle.no_grad()
     def rescoring(self):
         """Second-Pass Decoding,
            only for conformer and transformer model.
         """
         if "deepspeech2" in self.model_type:
@@ -652,11 +635,11 @@ class PaddleASRConnectionHanddler:
         ## asr results
         # hyps[0][0]: the sentence word-id in the vocab with a tuple
         # hyps[0][1]: the sentence decoding probability with all paths
         ## timestamp
         # hyps[0][2]: viterbi_blank ending probability
         # hyps[0][3]: viterbi_non_blank dending probability
         # hyps[0][4]: current_token_prob,
         # hyps[0][5]: times_viterbi_blank ending timestamp,
         # hyps[0][6]: times_titerbi_non_blank encding timestamp.
         self.hyps = [hyps[best_index][0]]
         logger.info(f"best hyp ids: {self.hyps}")
@@ -752,16 +735,19 @@ class ASRServerExecutor(ASRExecutor):
             self.config = CfgNode(new_allowed=True)
             self.config.merge_from_file(self.cfg_path)

+            if self.config.spm_model_prefix:
+                self.config.spm_model_prefix = os.path.join(
+                    self.res_path, self.config.spm_model_prefix)
+            self.text_feature = TextFeaturizer(
+                unit_type=self.config.unit_type,
+                vocab=self.config.vocab_filepath,
+                spm_model_prefix=self.config.spm_model_prefix)
+            self.vocab = self.config.vocab_filepath
             with UpdateConfig(self.config):
                 if "deepspeech2" in model_type:
-                    from paddlespeech.s2t.io.collator import SpeechCollator
-                    self.vocab = self.config.vocab_filepath
                     self.config.decode.lang_model_path = os.path.join(
                         MODEL_HOME, 'language_model',
                         self.config.decode.lang_model_path)
-                    self.collate_fn_test = SpeechCollator.from_config(
-                        self.config)
-                    self.text_feature = TextFeaturizer(
-                        unit_type=self.config.unit_type, vocab=self.vocab)

                     lm_url = self.task_resource.res_dict['lm_url']
                     lm_md5 = self.task_resource.res_dict['lm_md5']
@@ -772,14 +758,6 @@ class ASRServerExecutor(ASRExecutor):
                 elif "conformer" in model_type or "transformer" in model_type:
                     logger.info("start to create the stream conformer asr engine")
-                    if self.config.spm_model_prefix:
-                        self.config.spm_model_prefix = os.path.join(
-                            self.res_path, self.config.spm_model_prefix)
-                    self.vocab = self.config.vocab_filepath
-                    self.text_feature = TextFeaturizer(
-                        unit_type=self.config.unit_type,
-                        vocab=self.config.vocab_filepath,
-                        spm_model_prefix=self.config.spm_model_prefix)
                     # update the decoding method
                     if decode_method:
                         self.config.decode.decoding_method = decode_method
...
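The diff above swaps the SpeechCollator pipeline for a Transformation built from the model's preprocess config, and reads frame geometry from that config instead of window_ms/stride_ms. A sketch of the lookup with an assumed minimal fbank entry; plain dicts stand in for the real config object, and only the key names come from the diff:

    preprocess_conf = {
        "process": [{
            "type": "fbank_kaldi",
            "fs": 16000,
            "n_mels": 80,
            "n_shift": 160,     # 10 ms frame shift at 16 kHz
            "win_length": 400,  # 25 ms frame window at 16 kHz
        }]
    }
    win_length = preprocess_conf["process"][0]["win_length"]
    n_shift = preprocess_conf["process"][0]["n_shift"]
    print(win_length, n_shift)  # 400 160, already in samples, no ms conversion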
paddlespeech/server/engine/asr/paddleinference/asr_engine.py
View file @ 919c8d06
...
@@ -54,6 +54,7 @@ class ASRServerExecutor(ASRExecutor):
         self.max_len = 50
         sample_rate_str = '16k' if sample_rate == 16000 else '8k'
         tag = model_type + '-' + lang + '-' + sample_rate_str
+        self.max_len = 50
         self.task_resource.set_task_model(model_tag=tag)
         if cfg_path is None or am_model is None or am_params is None:
             self.res_path = self.task_resource.res_dir
...
@@ -80,22 +81,25 @@ class ASRServerExecutor(ASRExecutor):
         self.config.merge_from_file(self.cfg_path)

         with UpdateConfig(self.config):
-            if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
-                from paddlespeech.s2t.io.collator import SpeechCollator
+            if "deepspeech2" in model_type:
                 self.vocab = self.config.vocab_filepath
+                if self.config.spm_model_prefix:
+                    self.config.spm_model_prefix = os.path.join(
+                        self.res_path, self.config.spm_model_prefix)
+                self.text_feature = TextFeaturizer(
+                    unit_type=self.config.unit_type,
+                    vocab=self.vocab,
+                    spm_model_prefix=self.config.spm_model_prefix)
                 self.config.decode.lang_model_path = os.path.join(
                     MODEL_HOME, 'language_model',
                     self.config.decode.lang_model_path)
-                self.collate_fn_test = SpeechCollator.from_config(self.config)
-                self.text_feature = TextFeaturizer(
-                    unit_type=self.config.unit_type, vocab=self.vocab)

                 lm_url = self.task_resource.res_dict['lm_url']
                 lm_md5 = self.task_resource.res_dict['lm_md5']
                 self.download_lm(
                     lm_url,
                     os.path.dirname(self.config.decode.lang_model_path), lm_md5)
-            elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type:
+            elif "conformer" in model_type or "transformer" in model_type:
                 raise Exception("wrong type")
             else:
                 raise Exception("wrong type")
...
@@ -125,7 +129,7 @@ class ASRServerExecutor(ASRExecutor):
         cfg = self.config.decode
         audio = self._inputs["audio"]
         audio_len = self._inputs["audio_len"]
-        if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
+        if "deepspeech2" in model_type:
             decode_batch_size = audio.shape[0]
             # init once
             self.decoder.init_decoder(
...
@@ -222,10 +226,9 @@ class PaddleASRConnectionHandler(ASRServerExecutor):
         self.decoder = self.executor.decoder
         self.am_predictor = self.executor.am_predictor
         self.text_feature = self.executor.text_feature
-        self.collate_fn_test = self.executor.collate_fn_test

     def run(self, audio_data):
         """engine run

         Args:
             audio_data (bytes): base64.b64decode
...
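The recurring change in this file is collapsing the paired model-type checks into a single substring test. A quick check that the shorter condition covers both branches; the tag values are illustrative of the model_type naming scheme, not quoted from the resource table:

    for model_type in ("deepspeech2online_aishell",
                       "deepspeech2offline_aishell"):
        assert "deepspeech2online" in model_type or \
            "deepspeech2offline" in model_type  # old condition
        assert "deepspeech2" in model_type      # new condition, equivalent here
    print("both deepspeech2 variants take the same branch")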
paddlespeech/server/engine/tts/online/python/tts_engine.py
View file @ 919c8d06
...
@@ -40,7 +40,7 @@ class TTSServerExecutor(TTSExecutor):
     def __init__(self):
         super().__init__()
         self.task_resource = CommonTaskResource(
-            task='tts', model_format='static', inference_mode='online')
+            task='tts', model_format='dynamic', inference_mode='online')

     def get_model_info(self,
...
speechx/CMakeLists.txt
View file @ 919c8d06
...
@@ -142,4 +142,3 @@ set(DEPS ${DEPS}
 set(SPEECHX_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/speechx)

 add_subdirectory(speechx)
-add_subdirectory(examples)
speechx/examples/custom_asr/run.sh
View file @ 919c8d06
...
@@ -71,7 +71,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     recognizer_test_main \
         --wav_rspecifier=scp:$wav_scp \
         --cmvn_file=$cmvn \
-        --streaming_chunk=30 \
         --use_fbank=true \
         --model_path=$model_dir/avg_10.jit.pdmodel \
         --param_path=$model_dir/avg_10.jit.pdiparams \
...
speechx/examples/ds2_ol/README.md
View file @ 919c8d06
...
@@ -2,13 +2,5 @@

 ## Examples

-* `websocket` - Streaming ASR with websocket.
+* `websocket` - Streaming ASR with websocket for deepspeech2_aishell.
 * `aishell` - Streaming Decoding under aishell dataset, for local WER test.
\ No newline at end of file
-
-## More
-> The below is for developing and offline testing. Do not run it only if you know what it is.
-* nnet
-* feat
-* decoder
speechx/examples/ds2_ol/aishell/path.sh
View file @ 919c8d06
...
@@ -20,5 +20,5 @@ export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs

 export SRILM=${MAIN_ROOT}/tools/srilm

-SPEECHX_BIN=$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/websocket
+SPEECHX_BIN=$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio
 export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN:${SRILM}/bin:${SRILM}/bin/i686-m64:$KALDI_DIR/lmbin:$KALDI_DIR/fstbin:$OPENFST_DIR/bin
speechx/examples/ds2_ol/aishell/run.sh
View file @ 919c8d06
...
@@ -78,7 +78,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
         --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
         --cmvn_file=$cmvn \
-        --streaming_chunk=0.36

     echo "feature make have finished!!!"
 fi
...
@@ -155,7 +154,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
         --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
         --cmvn_file=$cmvn \
         --model_path=$model_dir/avg_1.jit.pdmodel \
-        --streaming_chunk=30 \
         --param_path=$model_dir/avg_1.jit.pdiparams \
         --word_symbol_table=$wfst/words.txt \
         --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
...
speechx/examples/ds2_ol/aishell/run_fbank.sh
View file @ 919c8d06
...
@@ -152,7 +152,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
         --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
         --cmvn_file=$cmvn \
         --model_path=$model_dir/avg_5.jit.pdmodel \
-        --streaming_chunk=30 \
         --use_fbank=true \
         --param_path=$model_dir/avg_5.jit.pdiparams \
         --word_symbol_table=$wfst/words.txt \
...
speechx/examples/ds2_ol/websocket/path.sh
View file @ 919c8d06
...
@@ -10,5 +10,5 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
 export LC_AL=C

-SPEECHX_BIN=$SPEECHX_BUILD/protocol/websocket
+SPEECHX_BIN=$SPEECHX_BUILD/protocol/websocket:$SPEECHX_BUILD/frontend/audio
 export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
speechx/examples/ds2_ol/websocket/websocket_client.sh
View file @ 919c8d06
...
@@ -32,4 +32,4 @@ export GLOG_logtostderr=1
 # websocket client
 websocket_client_main \
-    --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.36
+    --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.5
\ No newline at end of file
speechx/examples/ds2_ol/websocket/websocket_server.sh
View file @ 919c8d06
...
@@ -4,7 +4,6 @@ set -e
 . path.sh

 # 1. compile
 if [ ! -d ${SPEECHX_EXAMPLES} ]; then
     pushd ${SPEECHX_ROOT}
...
@@ -19,19 +18,6 @@ ckpt_dir=$data/model
 model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
 vocb_dir=$ckpt_dir/data/lang_char/

-# output
-aishell_wav_scp=aishell_test.scp
-if [ ! -d $data/test ]; then
-    pushd $data
-    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
-    unzip aishell_test.zip
-    popd
-
-    realpath $data/test/*/*.wav > $data/wavlist
-    awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
-    paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
-fi

 if [ ! -f $ckpt_dir/data/mean_std.json ]; then
     mkdir -p $ckpt_dir
...
@@ -62,7 +48,6 @@ fi
 websocket_server_main \
     --cmvn_file=$cmvn \
     --model_path=$model_dir/avg_1.jit.pdmodel \
-    --streaming_chunk=0.1 \
     --param_path=$model_dir/avg_1.jit.pdiparams \
     --word_symbol_table=$wfst/words.txt \
     --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
...
speechx/speechx/decoder/param.h
View file @ 919c8d06
...
@@ -25,7 +25,6 @@ DEFINE_bool(use_fbank, false, "False for fbank; or linear feature");
 // feature, or fbank");
 DEFINE_int32(num_bins, 161, "num bins of mel");
 DEFINE_string(cmvn_file, "", "read cmvn");
-DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size");
 // feature sliding window
 DEFINE_int32(receptive_field_length,
              7,
...
@@ -62,7 +61,6 @@ namespace ppspeech {
 FeaturePipelineOptions InitFeaturePipelineOptions() {
     FeaturePipelineOptions opts;
     opts.cmvn_file = FLAGS_cmvn_file;
-    opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
     kaldi::FrameExtractionOptions frame_opts;
     frame_opts.dither = 0.0;
     frame_opts.frame_shift_ms = 10;
...
@@ -71,8 +69,8 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
         opts.to_float32 = false;
         frame_opts.window_type = "povey";
         frame_opts.frame_length_ms = 25;
-        opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
-        opts.fbank_opts.fbank_opts.frame_opts = frame_opts;
+        opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
+        opts.fbank_opts.frame_opts = frame_opts;
     } else {
         opts.to_float32 = true;
         frame_opts.remove_dc_offset = false;
...
speechx/speechx/decoder/recognizer_main.cc
View file @ 919c8d06
...
@@ -19,6 +19,7 @@

 DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
 DEFINE_string(result_wspecifier, "", "test result wspecifier");
+DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
 DEFINE_int32(sample_rate, 16000, "sample rate");

 int main(int argc, char* argv[]) {
...
@@ -96,4 +97,4 @@ int main(int argc, char* argv[]) {
     KALDI_LOG << " cost:" << elapsed << " s";
     KALDI_LOG << "total wav duration is: " << tot_wav_duration << " s";
     KALDI_LOG << "the RTF is: " << elapsed / tot_wav_duration;
 }
\ No newline at end of file
speechx/speechx/frontend/audio/audio_cache.h
View file @ 919c8d06
...
@@ -30,8 +30,9 @@ class AudioCache : public FrontendInterface {
     virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);

-    // the audio dim is 1, one sample
-    virtual size_t Dim() const { return 1; }
+    // the audio dim is 1, one sample, which is useless,
+    // so we return size_(cache samples) instead.
+    virtual size_t Dim() const { return size_; }

     virtual void SetFinished() {
         std::lock_guard<std::mutex> lock(mutex_);
...
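The Dim() change gives callers the cached sample count instead of a constant. A Python stand-in for the new behavior; this toy class only models the semantics, and the locking of the real C++ AudioCache is omitted:

    class AudioCacheModel:
        """Toy model of AudioCache's new Dim() semantics."""

        def __init__(self):
            self._size = 0              # size_: samples currently cached

        def accept(self, samples):
            self._size += len(samples)

        def dim(self):
            return self._size           # was: return 1

    cache = AudioCacheModel()
    cache.accept([0.0] * 480)
    print(cache.dim())  # 480 cached samples rather than the constant 1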
speechx/speechx/frontend/audio/compute_fbank_main.cc
View file @ 919c8d06
...
@@ -49,12 +49,11 @@ int main(int argc, char* argv[]) {
     std::unique_ptr<ppspeech::FrontendInterface> data_source(
         new ppspeech::AudioCache(3600 * 1600, false));

-    ppspeech::FbankOptions opt;
-    opt.fbank_opts.frame_opts.frame_length_ms = 25;
-    opt.fbank_opts.frame_opts.frame_shift_ms = 10;
-    opt.streaming_chunk = FLAGS_streaming_chunk;
-    opt.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
-    opt.fbank_opts.frame_opts.dither = 0.0;
+    kaldi::FbankOptions opt;
+    opt.frame_opts.frame_length_ms = 25;
+    opt.frame_opts.frame_shift_ms = 10;
+    opt.mel_opts.num_bins = FLAGS_num_bins;
+    opt.frame_opts.dither = 0.0;

     std::unique_ptr<ppspeech::FrontendInterface> fbank(
         new ppspeech::Fbank(opt, std::move(data_source)));
...
speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
View file @ 919c8d06
...
@@ -49,7 +49,6 @@ int main(int argc, char* argv[]) {
     ppspeech::LinearSpectrogramOptions opt;
     opt.frame_opts.frame_length_ms = 20;
     opt.frame_opts.frame_shift_ms = 10;
-    opt.streaming_chunk = FLAGS_streaming_chunk;
     opt.frame_opts.dither = 0.0;
     opt.frame_opts.remove_dc_offset = false;
     opt.frame_opts.window_type = "hanning";
...
speechx/speechx/frontend/audio/fbank.cc
View file @ 919c8d06
...
@@ -12,7 +12,6 @@
...
@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#include "frontend/audio/fbank.h"
#include "frontend/audio/fbank.h"
#include "kaldi/base/kaldi-math.h"
#include "kaldi/base/kaldi-math.h"
#include "kaldi/feat/feature-common.h"
#include "kaldi/feat/feature-common.h"
...
@@ -29,95 +28,33 @@ using kaldi::VectorBase;
 using kaldi::Matrix;
 using std::vector;
 
-// todo refactor later:(SmileGoat)
-Fbank::Fbank(const FbankOptions& opts,
-             std::unique_ptr<FrontendInterface> base_extractor)
-    : opts_(opts),
-      computer_(opts.fbank_opts),
-      window_function_(opts.fbank_opts.frame_opts) {
-    base_extractor_ = std::move(base_extractor);
-    chunk_sample_size_ = static_cast<int32>(
-        opts.streaming_chunk * opts.fbank_opts.frame_opts.samp_freq);
-}
+FbankComputer::FbankComputer(const Options& opts)
+    : opts_(opts), computer_(opts) {}
 
-void Fbank::Accept(const VectorBase<BaseFloat>& inputs) {
-    base_extractor_->Accept(inputs);
-}
+int32 FbankComputer::Dim() const {
+    return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
+}
 
-bool Fbank::Read(Vector<BaseFloat>* feats) {
-    Vector<BaseFloat> wav(chunk_sample_size_);
-    bool flag = base_extractor_->Read(&wav);
-    if (flag == false || wav.Dim() == 0) return false;
-
-    // append remained waves
-    int32 wav_len = wav.Dim();
-    int32 left_len = remained_wav_.Dim();
-    Vector<BaseFloat> waves(left_len + wav_len);
-    waves.Range(0, left_len).CopyFromVec(remained_wav_);
-    waves.Range(left_len, wav_len).CopyFromVec(wav);
-
-    // compute speech feature
-    Compute(waves, feats);
-
-    // cache remained waves
-    kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
-    int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
-    int32 frame_shift = frame_opts.WindowShift();
-    int32 left_samples = waves.Dim() - frame_shift * num_frames;
-    remained_wav_.Resize(left_samples);
-    remained_wav_.CopyFromVec(
-        waves.Range(frame_shift * num_frames, left_samples));
-    return true;
-}
+bool FbankComputer::NeedRawLogEnergy() {
+    return opts_.use_energy && opts_.raw_energy;
+}
 
-// Compute spectrogram feat
-bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
-    const kaldi::FrameExtractionOptions& frame_opts =
-        computer_.GetFrameOptions();
-    int32 num_samples = waves.Dim();
-    int32 frame_length = frame_opts.WindowSize();
-    int32 sample_rate = frame_opts.samp_freq;
-    if (num_samples < frame_length) {
-        return true;
-    }
-
-    int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
-    feats->Resize(num_frames * Dim());
-
-    Vector<BaseFloat> window;
-    bool need_raw_log_energy = computer_.NeedRawLogEnergy();
-    for (int32 frame = 0; frame < num_frames; frame++) {
-        BaseFloat raw_log_energy = 0.0;
-        kaldi::ExtractWindow(0,
-                             waves,
-                             frame,
-                             frame_opts,
-                             window_function_,
-                             &window,
-                             need_raw_log_energy ? &raw_log_energy : NULL);
-
-        Vector<BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
-        // note: this online feature-extraction code does not support VTLN.
-        RealFft(&window, true);
-        kaldi::ComputePowerSpectrum(&window);
-        const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));
-        SubVector<BaseFloat> power_spectrum(window, 0, window.Dim() / 2 + 1);
-        if (!opts_.fbank_opts.use_power) {
-            power_spectrum.ApplyPow(0.5);
-        }
-        int32 mel_offset =
-            ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat)
-                 ? 1
-                 : 0);
-        SubVector<BaseFloat> mel_energies(
-            this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins);
-        mel_bank.Compute(power_spectrum, &mel_energies);
-        mel_energies.ApplyFloor(1e-07);
-        mel_energies.ApplyLog();
-        SubVector<BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
-        output_row.CopyFromVec(this_feature);
-    }
-    return true;
-}
+// Compute feat
+bool FbankComputer::Compute(Vector<BaseFloat>* window,
+                            Vector<BaseFloat>* feat) {
+    RealFft(window, true);
+    kaldi::ComputePowerSpectrum(window);
+    const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));
+    SubVector<BaseFloat> power_spectrum(*window, 0, window->Dim() / 2 + 1);
+    if (!opts_.use_power) {
+        power_spectrum.ApplyPow(0.5);
+    }
+    int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
+    SubVector<BaseFloat> mel_energies(*feat, mel_offset, opts_.mel_opts.num_bins);
+    mel_bank.Compute(power_spectrum, &mel_energies);
+    mel_energies.ApplyFloor(1e-07);
+    mel_energies.ApplyLog();
+    return true;
+}
...
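The change above strips Fbank down to a per-frame computer: the FFT, power-spectrum, and mel-bank steps stay, while chunk buffering and windowing move elsewhere. A minimal sketch of driving the new computer directly on one pre-windowed frame; the bin count and frame size here are illustrative assumptions, not values taken from this commit:

    #include "frontend/audio/fbank.h"

    int main() {
        ppspeech::FbankComputer::Options opts;  // i.e. kaldi::FbankOptions
        opts.mel_opts.num_bins = 80;            // assumed bin count

        ppspeech::FbankComputer computer(opts);

        // One already-windowed frame, padded to the FFT length
        // (e.g. 512 samples for a 25 ms window at 16 kHz).
        kaldi::Vector<kaldi::BaseFloat> window(512);
        kaldi::Vector<kaldi::BaseFloat> feat(computer.Dim());

        // Fills feat with 80 floored, log-compressed mel energies.
        computer.Compute(&window, &feat);
        return 0;
    }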
speechx/speechx/frontend/audio/fbank.h
View file @ 919c8d06
...
@@ -15,6 +15,7 @@
 #pragma once
 
 #include "base/common.h"
+#include "frontend/audio/feature_common.h"
 #include "frontend/audio/frontend_itf.h"
 #include "kaldi/feat/feature-fbank.h"
 #include "kaldi/feat/feature-mfcc.h"
...
@@ -22,56 +23,28 @@
 
 namespace ppspeech {
 
-struct FbankOptions {
-    kaldi::FbankOptions fbank_opts;
-    kaldi::BaseFloat streaming_chunk;  // second
-
-    FbankOptions() : streaming_chunk(0.1), fbank_opts() {}
-
-    void Register(kaldi::OptionsItf* opts) {
-        opts->Register("streaming-chunk",
-                       &streaming_chunk,
-                       "streaming chunk size, default: 0.1 sec");
-        fbank_opts.Register(opts);
-    }
-};
-
-class Fbank : public FrontendInterface {
+class FbankComputer {
   public:
-    explicit Fbank(const FbankOptions& opts,
-                   std::unique_ptr<FrontendInterface> base_extractor);
-    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
-    // the dim_ is the dim of single frame feature
-    virtual size_t Dim() const { return computer_.Dim(); }
-    virtual void SetFinished() { base_extractor_->SetFinished(); }
-    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
-    virtual void Reset() {
-        base_extractor_->Reset();
-        remained_wav_.Resize(0);
-    }
+    typedef kaldi::FbankOptions Options;
+    explicit FbankComputer(const Options& opts);
+
+    kaldi::FrameExtractionOptions& GetFrameOptions() {
+        return opts_.frame_opts;
+    }
+
+    bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
+                 kaldi::Vector<kaldi::BaseFloat>* feat);
+
+    int32 Dim() const;
+
+    bool NeedRawLogEnergy();
 
   private:
-    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
-                 kaldi::Vector<kaldi::BaseFloat>* feats);
-
-    FbankOptions opts_;
-    std::unique_ptr<FrontendInterface> base_extractor_;
-    kaldi::FeatureWindowFunction window_function_;
-    kaldi::FbankComputer computer_;
-    // features_ is the Mfcc or Plp or Fbank features that we have already
-    // computed.
-    kaldi::Vector<kaldi::BaseFloat> features_;
-    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
-    kaldi::int32 chunk_sample_size_;
-
-    DISALLOW_COPY_AND_ASSIGN(Fbank);
+    Options opts_;
+    kaldi::FbankComputer computer_;
+
+    DISALLOW_COPY_AND_ASSIGN(FbankComputer);
 };
 
+typedef StreamingFeatureTpl<FbankComputer> Fbank;
+
 }  // namespace ppspeech
speechx/speechx/frontend/audio/feature_common.h
0 → 100644
View file @ 919c8d06
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "frontend_itf.h"
#include "kaldi/feat/feature-window.h"
namespace ppspeech {

template <class F>
class StreamingFeatureTpl : public FrontendInterface {
  public:
    typedef typename F::Options Options;
    StreamingFeatureTpl(const Options& opts,
                        std::unique_ptr<FrontendInterface> base_extractor);
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);

    // the dim_ is the dim of single frame feature
    virtual size_t Dim() const { return computer_.Dim(); }
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
    virtual void Reset() {
        base_extractor_->Reset();
        remained_wav_.Resize(0);
    }

  private:
    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
                 kaldi::Vector<kaldi::BaseFloat>* feats);
    Options opts_;
    std::unique_ptr<FrontendInterface> base_extractor_;
    kaldi::FeatureWindowFunction window_function_;
    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
    F computer_;
};

}  // namespace ppspeech

#include "frontend/audio/feature_common_inl.h"
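StreamingFeatureTpl asks its template parameter F for only a small implicit interface: a nested Options type carrying frame_opts, a constructor from Options, GetFrameOptions(), Dim(), NeedRawLogEnergy(), and a per-frame Compute(window, feat). As a sketch, a hypothetical pass-through computer (not part of this commit) that satisfies the contract:

    // Hypothetical example: the smallest computer the template can host.
    struct RawWindowOptions {
        kaldi::FrameExtractionOptions frame_opts;
    };

    class RawWindowComputer {
      public:
        typedef RawWindowOptions Options;
        explicit RawWindowComputer(const Options& opts) : opts_(opts) {}
        kaldi::FrameExtractionOptions& GetFrameOptions() {
            return opts_.frame_opts;
        }
        // Feature dim = padded window length, since we emit raw samples.
        size_t Dim() const { return opts_.frame_opts.PaddedWindowSize(); }
        bool NeedRawLogEnergy() { return false; }
        bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
                     kaldi::Vector<kaldi::BaseFloat>* feat) {
            feat->CopyFromVec(*window);  // pass the windowed samples through
            return true;
        }

      private:
        Options opts_;
    };

    typedef ppspeech::StreamingFeatureTpl<RawWindowComputer> RawWindowSource;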
speechx/speechx/frontend/audio/feature_common_inl.h
0 → 100644
View file @ 919c8d06
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
namespace ppspeech {

template <class F>
StreamingFeatureTpl<F>::StreamingFeatureTpl(
    const Options& opts, std::unique_ptr<FrontendInterface> base_extractor)
    : opts_(opts),
      computer_(opts),
      window_function_(opts.frame_opts) {
    base_extractor_ = std::move(base_extractor);
}

template <class F>
void StreamingFeatureTpl<F>::Accept(
    const kaldi::VectorBase<kaldi::BaseFloat>& waves) {
    base_extractor_->Accept(waves);
}

template <class F>
bool StreamingFeatureTpl<F>::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
    kaldi::Vector<kaldi::BaseFloat> wav(base_extractor_->Dim());
    bool flag = base_extractor_->Read(&wav);
    if (flag == false || wav.Dim() == 0) return false;

    // append remained waves
    int32 wav_len = wav.Dim();
    int32 left_len = remained_wav_.Dim();
    kaldi::Vector<kaldi::BaseFloat> waves(left_len + wav_len);
    waves.Range(0, left_len).CopyFromVec(remained_wav_);
    waves.Range(left_len, wav_len).CopyFromVec(wav);

    // compute speech feature
    Compute(waves, feats);

    // cache remained waves
    kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
    int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
    int32 frame_shift = frame_opts.WindowShift();
    int32 left_samples = waves.Dim() - frame_shift * num_frames;
    remained_wav_.Resize(left_samples);
    remained_wav_.CopyFromVec(
        waves.Range(frame_shift * num_frames, left_samples));
    return true;
}

// Compute feat
template <class F>
bool StreamingFeatureTpl<F>::Compute(
    const kaldi::Vector<kaldi::BaseFloat>& waves,
    kaldi::Vector<kaldi::BaseFloat>* feats) {
    const kaldi::FrameExtractionOptions& frame_opts =
        computer_.GetFrameOptions();
    int32 num_samples = waves.Dim();
    int32 frame_length = frame_opts.WindowSize();
    int32 sample_rate = frame_opts.samp_freq;
    if (num_samples < frame_length) {
        return true;
    }

    int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
    feats->Resize(num_frames * Dim());

    kaldi::Vector<kaldi::BaseFloat> window;
    bool need_raw_log_energy = computer_.NeedRawLogEnergy();
    for (int32 frame = 0; frame < num_frames; frame++) {
        kaldi::BaseFloat raw_log_energy = 0.0;
        kaldi::ExtractWindow(0,
                             waves,
                             frame,
                             frame_opts,
                             window_function_,
                             &window,
                             need_raw_log_energy ? &raw_log_energy : NULL);
        kaldi::Vector<kaldi::BaseFloat> this_feature(computer_.Dim(),
                                                     kaldi::kUndefined);
        computer_.Compute(&window, &this_feature);
        kaldi::SubVector<kaldi::BaseFloat> output_row(
            feats->Data() + frame * Dim(), Dim());
        output_row.CopyFromVec(this_feature);
    }
    return true;
}

}  // namespace ppspeech
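With the template in place, a caller streams audio the same way as before; only construction changes. A sketch of the read loop, where `source` stands for any upstream FrontendInterface (e.g. an audio cache; the concrete source is an assumption and does not appear in this diff):

    #include <memory>
    #include "frontend/audio/fbank.h"

    // Drain whatever `source` has buffered into log-mel frames.
    void ReadAllFbank(std::unique_ptr<ppspeech::FrontendInterface> source) {
        ppspeech::FbankComputer::Options opts;           // kaldi::FbankOptions
        ppspeech::Fbank fbank(opts, std::move(source));  // StreamingFeatureTpl<FbankComputer>

        kaldi::Vector<kaldi::BaseFloat> feats;
        while (fbank.Read(&feats)) {
            // feats holds num_frames * fbank.Dim() values, frame-major;
            // samples too short for a full frame stay cached in remained_wav_.
        }
        fbank.SetFinished();
    }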
speechx/speechx/frontend/audio/feature_pipeline.h
View file @ 919c8d06
...
@@ -32,7 +32,7 @@ struct FeaturePipelineOptions {
     bool to_float32;  // true, only for linear feature
     bool use_fbank;
     LinearSpectrogramOptions linear_spectrogram_opts;
-    FbankOptions fbank_opts;
+    kaldi::FbankOptions fbank_opts;
     FeatureCacheOptions feature_cache_opts;
     AssemblerOptions assembler_opts;
...
speechx/speechx/frontend/audio/linear_spectrogram.cc
View file @ 919c8d06
...
@@ -28,81 +28,32 @@ using kaldi::VectorBase;
 using kaldi::Matrix;
 using std::vector;
 
-LinearSpectrogram::LinearSpectrogram(
-    const LinearSpectrogramOptions& opts,
-    std::unique_ptr<FrontendInterface> base_extractor)
-    : opts_(opts), feature_window_funtion_(opts.frame_opts) {
-    base_extractor_ = std::move(base_extractor);
-    int32 window_size = opts.frame_opts.WindowSize();
-    int32 window_shift = opts.frame_opts.WindowShift();
-    dim_ = window_size / 2 + 1;
-    chunk_sample_size_ =
-        static_cast<int32>(opts.streaming_chunk * opts.frame_opts.samp_freq);
-    hanning_window_energy_ = kaldi::VecVec(feature_window_funtion_.window,
-                                           feature_window_funtion_.window);
-}
+LinearSpectrogramComputer::LinearSpectrogramComputer(const Options& opts)
+    : opts_(opts) {
+    kaldi::FeatureWindowFunction feature_window_function(opts.frame_opts);
+    int32 window_size = opts.frame_opts.WindowSize();
+    frame_length_ = window_size;
+    dim_ = window_size / 2 + 1;
+    BaseFloat hanning_window_energy = kaldi::VecVec(
+        feature_window_function.window, feature_window_function.window);
+    int32 sample_rate = opts.frame_opts.samp_freq;
+    scale_ = 2.0 / (hanning_window_energy * sample_rate);
+}
 
-void LinearSpectrogram::Accept(const VectorBase<BaseFloat>& inputs) {
-    base_extractor_->Accept(inputs);
-}
-
-bool LinearSpectrogram::Read(Vector<BaseFloat>* feats) {
-    Vector<BaseFloat> input_feats(chunk_sample_size_);
-    bool flag = base_extractor_->Read(&input_feats);
-    if (flag == false || input_feats.Dim() == 0) return false;
-
-    int32 feat_len = input_feats.Dim();
-    int32 left_len = remained_wav_.Dim();
-    Vector<BaseFloat> waves(feat_len + left_len);
-    waves.Range(0, left_len).CopyFromVec(remained_wav_);
-    waves.Range(left_len, feat_len).CopyFromVec(input_feats);
-    Compute(waves, feats);
-    int32 frame_shift = opts_.frame_opts.WindowShift();
-    int32 num_frames = kaldi::NumFrames(waves.Dim(), opts_.frame_opts);
-    int32 left_samples = waves.Dim() - frame_shift * num_frames;
-    remained_wav_.Resize(left_samples);
-    remained_wav_.CopyFromVec(
-        waves.Range(frame_shift * num_frames, left_samples));
-    return true;
-}
 
 // Compute spectrogram feat
-bool LinearSpectrogram::Compute(const Vector<BaseFloat>& waves,
-                                Vector<BaseFloat>* feats) {
-    int32 num_samples = waves.Dim();
-    int32 frame_length = opts_.frame_opts.WindowSize();
-    int32 sample_rate = opts_.frame_opts.samp_freq;
-    BaseFloat scale = 2.0 / (hanning_window_energy_ * sample_rate);
-    if (num_samples < frame_length) {
-        return true;
-    }
-    int32 num_frames = kaldi::NumFrames(num_samples, opts_.frame_opts);
-    feats->Resize(num_frames * dim_);
-    Vector<BaseFloat> window;
-    for (int frame_idx = 0; frame_idx < num_frames; ++frame_idx) {
-        kaldi::ExtractWindow(0,
-                             waves,
-                             frame_idx,
-                             opts_.frame_opts,
-                             feature_window_funtion_,
-                             &window,
-                             NULL);
-        SubVector<BaseFloat> output_row(feats->Data() + frame_idx * dim_, dim_);
-        window.Resize(frame_length, kaldi::kCopyData);
-        RealFft(&window, true);
-        kaldi::ComputePowerSpectrum(&window);
-        SubVector<BaseFloat> power_spectrum(window, 0, dim_);
-        power_spectrum.Scale(scale);
-        power_spectrum(0) = power_spectrum(0) / 2;
-        power_spectrum(dim_ - 1) = power_spectrum(dim_ - 1) / 2;
-        power_spectrum.Add(1e-14);
-        power_spectrum.ApplyLog();
-        output_row.CopyFromVec(power_spectrum);
-    }
-    return true;
-}
+bool LinearSpectrogramComputer::Compute(Vector<BaseFloat>* window,
+                                        Vector<BaseFloat>* feat) {
+    window->Resize(frame_length_, kaldi::kCopyData);
+    RealFft(window, true);
+    kaldi::ComputePowerSpectrum(window);
+    SubVector<BaseFloat> power_spectrum(*window, 0, dim_);
+    power_spectrum.Scale(scale_);
+    power_spectrum(0) = power_spectrum(0) / 2;
+    power_spectrum(dim_ - 1) = power_spectrum(dim_ - 1) / 2;
+    power_spectrum.Add(1e-14);
+    power_spectrum.ApplyLog();
+    feat->CopyFromVec(power_spectrum);
+    return true;
+}
...
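One substantive detail in this file: the 2.0 / (hanning_window_energy * sample_rate) normalization used to be recomputed on every Read(); the constructor now derives it once and caches it in scale_, while the per-frame DC/Nyquist halving and the 1e-14 floor survive unchanged. A sketch of exercising the per-frame computer in isolation (the default 16 kHz / 25 ms frame options are an assumption for illustration):

    #include "frontend/audio/linear_spectrogram.h"

    int main() {
        ppspeech::LinearSpectrogramComputer::Options opts;  // default frame opts
        ppspeech::LinearSpectrogramComputer computer(opts);

        // One windowed frame; Compute() itself resizes to frame_length_.
        kaldi::Vector<kaldi::BaseFloat> window(opts.frame_opts.WindowSize());
        kaldi::Vector<kaldi::BaseFloat> feat(computer.Dim());  // WindowSize()/2 + 1 bins

        // Scaled, floored (1e-14) log power spectrum of the frame.
        computer.Compute(&window, &feat);
        return 0;
    }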
speechx/speechx/frontend/audio/linear_spectrogram.h
View file @ 919c8d06
...
@@ -16,6 +16,7 @@
 #pragma once
 
 #include "base/common.h"
+#include "frontend/audio/feature_common.h"
 #include "frontend/audio/frontend_itf.h"
 #include "kaldi/feat/feature-window.h"
...
@@ -23,47 +24,34 @@ namespace ppspeech {
 
 struct LinearSpectrogramOptions {
     kaldi::FrameExtractionOptions frame_opts;
-    kaldi::BaseFloat streaming_chunk;  // second
-    LinearSpectrogramOptions() : streaming_chunk(0.1), frame_opts() {}
-
-    void Register(kaldi::OptionsItf* opts) {
-        opts->Register("streaming-chunk",
-                       &streaming_chunk,
-                       "streaming chunk size, default: 0.1 sec");
-        frame_opts.Register(opts);
-    }
+    LinearSpectrogramOptions() : frame_opts() {}
 };
 
-class LinearSpectrogram : public FrontendInterface {
+class LinearSpectrogramComputer {
   public:
-    explicit LinearSpectrogram(
-        const LinearSpectrogramOptions& opts,
-        std::unique_ptr<FrontendInterface> base_extractor);
-    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
-    // the dim_ is the dim of single frame feature
-    virtual size_t Dim() const { return dim_; }
-    virtual void SetFinished() { base_extractor_->SetFinished(); }
-    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
-    virtual void Reset() {
-        base_extractor_->Reset();
-        remained_wav_.Resize(0);
-    }
+    typedef LinearSpectrogramOptions Options;
+    explicit LinearSpectrogramComputer(const Options& opts);
+
+    kaldi::FrameExtractionOptions& GetFrameOptions() {
+        return opts_.frame_opts;
+    }
+
+    bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
+                 kaldi::Vector<kaldi::BaseFloat>* feat);
+
+    int32 Dim() const { return dim_; }
+
+    bool NeedRawLogEnergy() { return false; }
 
   private:
-    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
-                 kaldi::Vector<kaldi::BaseFloat>* feats);
-    size_t dim_;
-    kaldi::FeatureWindowFunction feature_window_funtion_;
-    kaldi::BaseFloat hanning_window_energy_;
-    LinearSpectrogramOptions opts_;
-    std::unique_ptr<FrontendInterface> base_extractor_;
-    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
-    int chunk_sample_size_;
-    DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram);
+    kaldi::BaseFloat scale_;
+    Options opts_;
+    int32 frame_length_;
+    int32 dim_;
+    DISALLOW_COPY_AND_ASSIGN(LinearSpectrogramComputer);
 };
 
+typedef StreamingFeatureTpl<LinearSpectrogramComputer> LinearSpectrogram;
+
 }  // namespace ppspeech
\ No newline at end of file
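The net effect of the header changes: the two frontends are now one template instantiation apart, and all streaming state (base extractor, window function, remained_wav_) lives in a single place. A sketch of the resulting symmetry, where the two upstream sources are hypothetical stand-ins:

    #include <memory>
    #include "frontend/audio/fbank.h"
    #include "frontend/audio/linear_spectrogram.h"

    // Same wrapper, two computers: only the policy type differs.
    // `source_a` / `source_b` are assumed upstream extractors.
    void BuildBothFrontends(std::unique_ptr<ppspeech::FrontendInterface> source_a,
                            std::unique_ptr<ppspeech::FrontendInterface> source_b) {
        ppspeech::LinearSpectrogramComputer::Options spec_opts;
        ppspeech::LinearSpectrogram spectrogram(spec_opts, std::move(source_a));

        ppspeech::FbankComputer::Options fbank_opts;
        ppspeech::Fbank fbank(fbank_opts, std::move(source_b));
    }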
speechx/speechx/utils/CMakeLists.txt
View file @ 919c8d06
 add_library(utils
     file_utils.cc
-    simdjson.cpp
 )
\ No newline at end of file
speechx/speechx/utils/simdjson.cpp
Deleted 100644 → 0
View file @ 8b1c1ec4
This diff is collapsed. Click to expand.

speechx/speechx/utils/simdjson.h
Deleted 100644 → 0
View file @ 8b1c1ec4
This diff is collapsed. Click to expand.