PaddlePaddle / DeepSpeech
Commit 919c8d06
Authored on Jun 05, 2022 by liangym; committed by lym0302 on Jun 05, 2022.

    Merge branch 'PaddlePaddle:develop' into update_engine

Parents: 8b1c1ec4, eea56a4a
Showing 74 changed files with 1011 additions and 55785 deletions (+1011 -55785).
Changed files:
    demos/streaming_asr_server/web/templates/index.html  +1  -0
    docker/ubuntu18-cpu/Dockerfile  +15  -0
    examples/aishell/asr0/conf/augmentation.json  +0  -36
    examples/aishell/asr0/conf/deepspeech2.yaml  +31  -28
    examples/aishell/asr0/conf/deepspeech2_online.yaml  +19  -20
    examples/aishell/asr0/conf/preprocess.yaml  +25  -0
    examples/aishell/asr0/conf/tuning/decode.yaml  +3  -3
    examples/aishell/asr0/local/data.sh  +4  -3
    examples/aishell/asr0/local/export.sh  +3  -5
    examples/aishell/asr0/local/test.sh  +13  -15
    examples/aishell/asr0/local/test_export.sh  +3  -5
    examples/aishell/asr0/local/test_wav.sh  +3  -5
    examples/aishell/asr0/local/train.sh  +2  -5
    examples/aishell/asr0/run.sh  +6  -7
    examples/librispeech/asr0/conf/augmentation.json  +0  -36
    examples/librispeech/asr0/conf/deepspeech2.yaml  +29  -29
    examples/librispeech/asr0/conf/deepspeech2_online.yaml  +28  -31
    examples/librispeech/asr0/conf/preprocess.yaml  +25  -0
    examples/librispeech/asr0/local/data.sh  +4  -3
    examples/librispeech/asr0/local/export.sh  +3  -5
    examples/librispeech/asr0/local/test.sh  +40  -13
    examples/librispeech/asr0/local/test_wav.sh  +3  -5
    examples/librispeech/asr0/local/train.sh  +2  -5
    examples/librispeech/asr0/run.sh  +11  -7
    examples/tiny/asr0/conf/augmentation.json  +0  -36
    examples/tiny/asr0/conf/deepspeech2.yaml  +21  -20
    examples/tiny/asr0/conf/deepspeech2_online.yaml  +19  -20
    examples/tiny/asr0/conf/preprocess.yaml  +25  -0
    examples/tiny/asr0/local/export.sh  +3  -5
    examples/tiny/asr0/local/test.sh  +3  -5
    examples/tiny/asr0/local/train.sh  +2  -5
    examples/tiny/asr0/run.sh  +3  -5
    paddlespeech/cli/asr/infer.py  +12  -27
    paddlespeech/resource/model_alias.py  +1  -1
    paddlespeech/resource/pretrained_models.py  +16  -16
    paddlespeech/s2t/exps/deepspeech2/bin/export.py  +1  -4
    paddlespeech/s2t/exps/deepspeech2/bin/test.py  +1  -4
    paddlespeech/s2t/exps/deepspeech2/bin/test_export.py  +0  -3
    paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py  +1  -10
    paddlespeech/s2t/exps/deepspeech2/bin/train.py  +0  -3
    paddlespeech/s2t/exps/deepspeech2/model.py  +81  -93
    paddlespeech/s2t/models/ds2/conv.py  +14  -152
    paddlespeech/s2t/models/ds2/deepspeech2.py  +242  -92
    paddlespeech/s2t/models/ds2/rnn.py  +0  -315
    paddlespeech/s2t/models/ds2_online/__init__.py  +0  -31
    paddlespeech/s2t/models/ds2_online/conv.py  +0  -33
    paddlespeech/s2t/models/ds2_online/deepspeech2.py  +0  -397
    paddlespeech/server/engine/asr/online/asr_engine.py  +34  -56
    paddlespeech/server/engine/asr/paddleinference/asr_engine.py  +12  -9
    paddlespeech/server/engine/tts/online/python/tts_engine.py  +1  -1
    speechx/CMakeLists.txt  +0  -1
    speechx/examples/custom_asr/run.sh  +0  -1
    speechx/examples/ds2_ol/README.md  +2  -10
    speechx/examples/ds2_ol/aishell/path.sh  +1  -1
    speechx/examples/ds2_ol/aishell/run.sh  +0  -2
    speechx/examples/ds2_ol/aishell/run_fbank.sh  +0  -1
    speechx/examples/ds2_ol/websocket/path.sh  +1  -1
    speechx/examples/ds2_ol/websocket/websocket_client.sh  +1  -1
    speechx/examples/ds2_ol/websocket/websocket_server.sh  +0  -15
    speechx/speechx/decoder/param.h  +2  -4
    speechx/speechx/decoder/recognizer_main.cc  +2  -1
    speechx/speechx/frontend/audio/audio_cache.h  +3  -2
    speechx/speechx/frontend/audio/compute_fbank_main.cc  +5  -6
    speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc  +0  -1
    speechx/speechx/frontend/audio/fbank.cc  +20  -83
    speechx/speechx/frontend/audio/fbank.h  +15  -42
    speechx/speechx/frontend/audio/feature_common.h  +54  -0
    speechx/speechx/frontend/audio/feature_common_inl.h  +95  -0
    speechx/speechx/frontend/audio/feature_pipeline.h  +1  -1
    speechx/speechx/frontend/audio/linear_spectrogram.cc  +21  -70
    speechx/speechx/frontend/audio/linear_spectrogram.h  +22  -34
    speechx/speechx/utils/CMakeLists.txt  +1  -2
    speechx/speechx/utils/simdjson.cpp  +0  -16016
    speechx/speechx/utils/simdjson.h  +0  -37881
demos/streaming_asr_server/web/templates/index.html    (view file @ 919c8d06)

@@ -93,6 +93,7 @@
      function parseResult(data) {
          var data = JSON.parse(data)
          console.log('result json:', data)
          var result = data.result
          console.log(result)
 +        $("#resultPanel").html(result)
docker/ubuntu18-cpu/Dockerfile    (new file, 0 → 100644; view file @ 919c8d06)

FROM registry.baidubce.com/paddlepaddle/paddle:2.2.2
LABEL maintainer="paddlesl@baidu.com"

RUN git clone --depth 1 https://github.com/PaddlePaddle/PaddleSpeech.git /home/PaddleSpeech

RUN pip3 uninstall mccabe -y ; exit 0;
RUN pip3 install multiprocess==0.70.12 importlib-metadata==4.2.0 dill==0.3.4

RUN cd /home/PaddleSpeech/audio
RUN python setup.py bdist_wheel

RUN cd /home/PaddleSpeech
RUN python setup.py bdist_wheel

RUN pip install audio/dist/*.whl dist/*.whl

WORKDIR /home/PaddleSpeech/
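For context, a minimal sketch of how this image might be built and entered from a PaddleSpeech checkout; the image tag is illustrative and not part of the commit, and note that a `cd` in one RUN layer does not persist into the next, so the two `python setup.py bdist_wheel` steps may need `cd ... && ...` chaining to run inside the intended directories:

    # build the CPU image; the Dockerfile clones PaddleSpeech itself, so no extra build context is required
    docker build -t paddlespeech:ubuntu18-cpu docker/ubuntu18-cpu

    # start an interactive shell in the container (the image's WORKDIR is /home/PaddleSpeech)
    docker run -it --rm paddlespeech:ubuntu18-cpu /bin/bash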
examples/aishell/asr0/conf/augmentation.json    (deleted, 100644 → 0; view file @ 8b1c1ec4)

The whole file is removed:

[
  {
    "type": "speed",
    "params": {
      "min_speed_rate": 0.9,
      "max_speed_rate": 1.1,
      "num_rates": 3
    },
    "prob": 0.0
  },
  {
    "type": "shift",
    "params": {
      "min_shift_ms": -5,
      "max_shift_ms": 5
    },
    "prob": 1.0
  },
  {
    "type": "specaug",
    "params": {
      "W": 0,
      "warp_mode": "PIL",
      "F": 10,
      "n_freq_masks": 2,
      "T": 50,
      "n_time_masks": 2,
      "p": 1.0,
      "adaptive_number_ratio": 0,
      "adaptive_size_ratio": 0,
      "max_n_time_masks": 20,
      "replace_with_zero": true
    },
    "prob": 1.0
  }
]
examples/aishell/asr0/conf/deepspeech2.yaml    (view file @ 919c8d06)

@@ -15,50 +15,53 @@ max_output_input_ratio: .inf
###########################################
# Dataloader                              #
###########################################
- batch_size: 64  # one gpu
- mean_std_filepath: data/mean_std.json
- unit_type: char
  vocab_filepath: data/lang_char/vocab.txt
- augmentation_config: conf/augmentation.json
- random_seed: 0
- spm_model_prefix:
- spectrum_type: linear
+ spm_model_prefix: ''
+ unit_type: 'char'
+ preprocess_config: conf/preprocess.yaml
  feat_dim: 161
- delta_delta: False
  stride_ms: 10.0
- window_ms: 20.0
- n_fft: None
- max_freq: None
- target_sample_rate: 16000
- use_dB_normalization: True
- target_dB: -20
- dither: 1.0
- keep_transcription_text: False
- sortagrad: True
- shuffle_method: batch_shuffle
- num_workers: 2
+ window_ms: 25.0
+ sortagrad: 0  # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+ batch_size: 64
+ maxlen_in: 512  # if input length > maxlen-in, batchsize is automatically reduced
+ maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+ minibatches: 0  # for debug
+ batch_count: auto
+ batch_bins: 0
+ batch_frames_in: 0
+ batch_frames_out: 0
+ batch_frames_inout: 0
+ num_workers: 8
+ subsampling_factor: 1
+ num_encs: 1

############################################
# Network Architecture                     #
############################################
  num_conv_layers: 2
- num_rnn_layers: 3
+ num_rnn_layers: 5
  rnn_layer_size: 1024
- use_gru: True
- share_rnn_weights: False
+ rnn_direction: bidirect  # [forward, bidirect]
+ num_fc_layers: 0
+ fc_layers_size_list: -1,
+ use_gru: False
  blank_id: 0
  ctc_grad_norm_type: instance

###########################################
# Training                                #
###########################################
- n_epoch: 80
+ n_epoch: 50
  accum_grad: 1
- lr: 2.0e-3
- lr_decay: 0.83
+ lr: 5.0e-4
+ lr_decay: 0.93
  weight_decay: 1.0e-6
  global_grad_clip: 3.0
- log_interval: 100
+ dist_sampler: False
+ log_interval: 1
  checkpoint:
    kbest_n: 50
    latest_n: 5
examples/aishell/asr0/conf/deepspeech2_online.yaml    (view file @ 919c8d06)

@@ -15,28 +15,26 @@ max_output_input_ratio: .inf
###########################################
# Dataloader                              #
###########################################
- batch_size: 64  # one gpu
- mean_std_filepath: data/mean_std.json
- unit_type: char
  vocab_filepath: data/lang_char/vocab.txt
- augmentation_config: conf/augmentation.json
- random_seed: 0
- spm_model_prefix:
- spectrum_type: linear  #linear, mfcc, fbank
+ spm_model_prefix: ''
+ unit_type: 'char'
+ preprocess_config: conf/preprocess.yaml
  feat_dim: 161
- delta_delta: False
  stride_ms: 10.0
- window_ms: 20.0
- n_fft: None
- max_freq: None
- target_sample_rate: 16000
- use_dB_normalization: True
- target_dB: -20
- dither: 1.0
- keep_transcription_text: False
- sortagrad: True
- shuffle_method: batch_shuffle
- num_workers: 0
+ window_ms: 25.0
+ sortagrad: 0  # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+ batch_size: 64
+ maxlen_in: 512  # if input length > maxlen-in, batchsize is automatically reduced
+ maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+ minibatches: 0  # for debug
+ batch_count: auto
+ batch_bins: 0
+ batch_frames_in: 0
+ batch_frames_out: 0
+ batch_frames_inout: 0
+ num_workers: 8
+ subsampling_factor: 1
+ num_encs: 1

############################################
# Network Architecture                     #
...
@@ -54,12 +52,13 @@ blank_id: 0
###########################################
# Training                                #
###########################################
- n_epoch: 65
+ n_epoch: 30
  accum_grad: 1
  lr: 5.0e-4
  lr_decay: 0.93
  weight_decay: 1.0e-6
  global_grad_clip: 3.0
+ dist_sampler: False
  log_interval: 100
  checkpoint:
    kbest_n: 50
...
examples/aishell/asr0/conf/preprocess.yaml    (new file, 0 → 100644; view file @ 919c8d06)

process:
  # extract kaldi fbank from PCM
  - type: fbank_kaldi
    fs: 16000
    n_mels: 161
    n_shift: 160
    win_length: 400
    dither: 0.1
  - type: cmvn_json
    cmvn_path: data/mean_std.json
  # these three processes are a.k.a. SpecAugument
  - type: time_warp
    max_time_warp: 5
    inplace: true
    mode: PIL
  - type: freq_mask
    F: 30
    n_mask: 2
    inplace: true
    replace_with_zero: false
  - type: time_mask
    T: 40
    n_mask: 2
    inplace: true
    replace_with_zero: false
examples/aishell/asr0/conf/tuning/decode.yaml    (view file @ 919c8d06)

@@ -2,9 +2,9 @@ decode_batch_size: 128
  error_rate_type: cer
  decoding_method: ctc_beam_search
  lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
- alpha: 1.9
- beta: 5.0
- beam_size: 300
+ alpha: 2.2
+ beta: 4.3
+ beam_size: 500
  cutoff_prob: 0.99
  cutoff_top_n: 40
  num_proc_bsearch: 10
examples/aishell/asr0/local/data.sh    (view file @ 919c8d06)

@@ -33,12 +33,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
      num_workers=$(nproc)
      python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
      --manifest_path="data/manifest.train.raw" \
-     --spectrum_type="linear" \
+     --spectrum_type="fbank" \
      --feat_dim=161 \
      --delta_delta=false \
      --stride_ms=10 \
-     --window_ms=20 \
+     --window_ms=25 \
      --sample_rate=16000 \
-     --use_dB_normalization=True \
+     --use_dB_normalization=False \
      --num_samples=2000 \
      --num_workers=${num_workers} \
      --output_path="data/mean_std.json"
...
examples/aishell/asr0/local/export.sh    (view file @ 919c8d06)

#!/bin/bash

- if [ $# != 4 ];then
-     echo "usage: $0 config_path ckpt_prefix jit_model_path model_type"
+ if [ $# != 3 ];then
+     echo "usage: $0 config_path ckpt_prefix jit_model_path"
      exit -1
  fi
...
@@ -11,14 +11,12 @@ echo "using $ngpu gpus..."
  config_path=$1
  ckpt_path_prefix=$2
  jit_model_export_path=$3
- model_type=$4

  python3 -u ${BIN_DIR}/export.py \
  --ngpu ${ngpu} \
  --config ${config_path} \
  --checkpoint_path ${ckpt_path_prefix} \
- --export_path ${jit_model_export_path} \
- --model_type ${model_type}
+ --export_path ${jit_model_export_path}

  if [ $? -ne 0 ]; then
      echo "Failed in export!"
...
examples/aishell/asr0/local/test.sh    (view file @ 919c8d06)

#!/bin/bash

- if [ $# != 4 ];then
-     echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
+ if [ $# != 3 ];then
+     echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
      exit -1
  fi
...
@@ -13,7 +13,6 @@ echo "using $ngpu gpus..."
  config_path=$1
  decode_config_path=$2
  ckpt_prefix=$3
- model_type=$4

  # download language model
  bash local/download_lm_ch.sh
...
@@ -23,7 +22,7 @@ fi
  if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
      # format the reference test file
-     python utils/format_rsl.py \
+     python3 utils/format_rsl.py \
          --origin_ref data/manifest.test.raw \
          --trans_ref data/manifest.test.text
...
@@ -32,8 +31,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
      --config ${config_path} \
      --decode_cfg ${decode_config_path} \
      --result_file ${ckpt_prefix}.rsl \
-     --checkpoint_path ${ckpt_prefix} \
-     --model_type ${model_type}
+     --checkpoint_path ${ckpt_prefix}

      if [ $? -ne 0 ]; then
          echo "Failed in evaluation!"
...
@@ -41,20 +39,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  fi

      # format the hyp file
-     python utils/format_rsl.py \
+     python3 utils/format_rsl.py \
          --origin_hyp ${ckpt_prefix}.rsl \
          --trans_hyp ${ckpt_prefix}.rsl.text

-     python utils/compute-wer.py --char=1 --v=1 \
+     python3 utils/compute-wer.py --char=1 --v=1 \
          data/manifest.test.text ${ckpt_prefix}.rsl.text > ${ckpt_prefix}.error
  fi

  if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
-     python utils/format_rsl.py \
+     python3 utils/format_rsl.py \
          --origin_ref data/manifest.test.raw \
          --trans_ref_sclite data/manifest.test.text.sclite

-     python utils/format_rsl.py \
+     python3 utils/format_rsl.py \
          --origin_hyp ${ckpt_prefix}.rsl \
          --trans_hyp_sclite ${ckpt_prefix}.rsl.text.sclite
...
examples/aishell/asr0/local/test_export.sh    (view file @ 919c8d06)

#!/bin/bash

- if [ $# != 4 ];then
-     echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
+ if [ $# != 3 ];then
+     echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
      exit -1
  fi
...
@@ -11,7 +11,6 @@ echo "using $ngpu gpus..."
  config_path=$1
  decode_config_path=$2
  jit_model_export_path=$3
- model_type=$4

  # download language model
  bash local/download_lm_ch.sh > /dev/null 2>&1
...
@@ -24,8 +23,7 @@ python3 -u ${BIN_DIR}/test_export.py \
  --config ${config_path} \
  --decode_cfg ${decode_config_path} \
  --result_file ${jit_model_export_path}.rsl \
- --export_path ${jit_model_export_path} \
- --model_type ${model_type}
+ --export_path ${jit_model_export_path}

  if [ $? -ne 0 ]; then
      echo "Failed in evaluation!"
...
examples/aishell/asr0/local/test_wav.sh    (view file @ 919c8d06)

#!/bin/bash

- if [ $# != 5 ];then
-     echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file"
+ if [ $# != 4 ];then
+     echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
      exit -1
  fi
...
@@ -11,8 +11,7 @@ echo "using $ngpu gpus..."
  config_path=$1
  decode_config_path=$2
  ckpt_prefix=$3
- model_type=$4
- audio_file=$5
+ audio_file=$4

  mkdir -p data
  wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
...
@@ -37,7 +36,6 @@ python3 -u ${BIN_DIR}/test_wav.py \
  --decode_cfg ${decode_config_path} \
  --result_file ${ckpt_prefix}.rsl \
  --checkpoint_path ${ckpt_prefix} \
- --model_type ${model_type} \
  --audio_file ${audio_file}

  if [ $? -ne 0 ]; then
...
examples/aishell/asr0/local/train.sh    (view file @ 919c8d06)

#!/bin/bash

- if [ $# != 3 ];then
-     echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type"
+ if [ $# != 2 ];then
+     echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
      exit -1
  fi
...
@@ -10,7 +10,6 @@ echo "using $ngpu gpus..."
  config_path=$1
  ckpt_name=$2
- model_type=$3

  mkdir -p exp
...
@@ -25,14 +24,12 @@ python3 -u ${BIN_DIR}/train.py \
  --ngpu ${ngpu} \
  --config ${config_path} \
  --output exp/${ckpt_name} \
- --model_type ${model_type} \
  --seed ${seed}
  else
  python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
  --ngpu ${ngpu} \
  --config ${config_path} \
  --output exp/${ckpt_name} \
- --model_type ${model_type} \
  --seed ${seed}
  fi
...
examples/aishell/asr0/run.sh    (view file @ 919c8d06)

@@ -7,8 +7,7 @@ stage=0
  stop_stage=100
  conf_path=conf/deepspeech2.yaml  #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml
  decode_conf_path=conf/tuning/decode.yaml
- avg_num=1
- model_type=offline  # offline or online
+ avg_num=10
  audio_file=data/demo_01_03.wav

  source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
...
@@ -25,7 +24,7 @@ fi
  if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
      # train model, all `ckpt` under `exp` dir
-     CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type}
+     CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
  fi

  if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
...
@@ -35,21 +34,21 @@ fi
  if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
      # test ckpt avg_n
-     CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
+     CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
  fi

  if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
      # export ckpt avg_n
-     CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
+     CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
  fi

  if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
      # test export ckpt avg_n
-     CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type} || exit -1
+     CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit|| exit -1
  fi

  # Optionally, you can add LM and test it with runtime.
  if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
      # test a single .wav file
-     CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
+     CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
  fi
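With the model_type switch gone from the recipe, a minimal sketch of driving the refreshed AISHELL pipeline end to end; the GPU list and config choice are illustrative, and the --flag value overrides are handled by utils/parse_options.sh as before:

    cd examples/aishell/asr0
    source path.sh
    # data prep, training, checkpoint averaging, testing, export and single-wav test (stages 0-6)
    bash run.sh --gpus 0,1,2,3 --stage 0 --stop_stage 6 \
        --conf_path conf/deepspeech2_online.yaml \
        --decode_conf_path conf/tuning/decode.yaml \
        --avg_num 10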
examples/librispeech/asr0/conf/augmentation.json    (deleted, 100644 → 0; view file @ 8b1c1ec4)

The whole file is removed; its content was identical to examples/aishell/asr0/conf/augmentation.json above (the speed, shift and specaug policies with W: 0, F: 10, n_freq_masks: 2, T: 50, n_time_masks: 2, p: 1.0, max_n_time_masks: 20, replace_with_zero: true).
examples/librispeech/asr0/conf/deepspeech2.yaml    (view file @ 919c8d06)

@@ -15,51 +15,51 @@ max_output_input_ratio: .inf
Dataloader: restructured the same way as the aishell config above. The old keys (batch_size: 20, mean_std_filepath: data/mean_std.json, unit_type: char, augmentation_config: conf/augmentation.json, random_seed: 0, spectrum_type: linear, the empty feat_dim, target_sample_rate: 16000, max_freq: None, n_fft: None, window_ms: 20.0, delta_delta: False, dither: 1.0, use_dB_normalization: True, target_dB: -20, keep_transcription_text: False, sortagrad: True, shuffle_method: batch_shuffle, num_workers: 2) are dropped; the new block keeps vocab_filepath and stride_ms: 10.0 and sets spm_model_prefix: '', unit_type: 'char', preprocess_config: conf/preprocess.yaml, feat_dim: 161, window_ms: 25.0, sortagrad: 0, batch_size: 64, maxlen_in: 512, maxlen_out: 150, minibatches: 0, batch_count: auto, batch_bins/batch_frames_*: 0, num_workers: 8, subsampling_factor: 1, num_encs: 1.

############################################
# Network Architecture                     #
############################################
  num_conv_layers: 2
- num_rnn_layers: 3
- rnn_layer_size: 2048
+ num_rnn_layers: 5
+ rnn_layer_size: 1024
+ rnn_direction: bidirect
+ num_fc_layers: 0
+ fc_layers_size_list: -1
  use_gru: False
- share_rnn_weights: True
  blank_id: 0

###########################################
# Training                                #
###########################################
- n_epoch: 50
+ n_epoch: 15
  accum_grad: 1
- lr: 1.0e-3
- lr_decay: 0.83
+ lr: 5.0e-4
+ lr_decay: 0.93
  weight_decay: 1.0e-6
  global_grad_clip: 5.0
- log_interval: 100
+ dist_sampler: False
+ log_interval: 1
  checkpoint:
    kbest_n: 50
    latest_n: 5
examples/librispeech/asr0/conf/deepspeech2_online.yaml    (view file @ 919c8d06)

@@ -15,39 +15,36 @@ max_output_input_ratio: .inf
Dataloader: same restructuring as above. The old block (batch_size: 15, mean_std_filepath, unit_type: char, augmentation_config, random_seed, spectrum_type: linear, the empty feat_dim, target_sample_rate, max_freq, n_fft, window_ms: 20.0, delta_delta, dither, use_dB_normalization, target_dB, keep_transcription_text, sortagrad: True, shuffle_method, num_workers: 0) is replaced by spm_model_prefix: '', unit_type: 'char', preprocess_config: conf/preprocess.yaml, feat_dim: 161, window_ms: 25.0, sortagrad: 0, batch_size: 64, maxlen_in: 512, maxlen_out: 150, minibatches: 0, batch_count: auto, batch_bins/batch_frames_*: 0, num_workers: 8, subsampling_factor: 1, num_encs: 1.

############################################
# Network Architecture                     #
############################################
  num_conv_layers: 2
- num_rnn_layers: 3
- rnn_layer_size: 2048
+ num_rnn_layers: 5
+ rnn_layer_size: 1024
  rnn_direction: forward
- num_fc_layers: 2
- fc_layers_size_list: 512, 256
+ num_fc_layers: 0
+ fc_layers_size_list: -1
  use_gru: False
  blank_id: 0
...
@@ -55,13 +52,13 @@ blank_id: 0
###########################################
# Training                                #
###########################################
- n_epoch: 50
- accum_grad: 4
- lr: 1.0e-3
- lr_decay: 0.83
+ n_epoch: 65
+ accum_grad: 1
+ lr: 5.0e-4
+ lr_decay: 0.93
  weight_decay: 1.0e-6
  global_grad_clip: 5.0
- log_interval: 100
+ log_interval: 1
  checkpoint:
    kbest_n: 50
    latest_n: 5
examples/librispeech/asr0/conf/preprocess.yaml    (new file, 0 → 100644; view file @ 919c8d06)

New file with the same content as examples/aishell/asr0/conf/preprocess.yaml above (fbank_kaldi extraction at fs 16000 with n_mels 161, cmvn_json with data/mean_std.json, and the SpecAugment time_warp / freq_mask / time_mask processes).
examples/librispeech/asr0/local/data.sh    (view file @ 919c8d06)

@@ -49,12 +49,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
      python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
      --manifest_path="data/manifest.train.raw" \
      --num_samples=2000 \
-     --spectrum_type="linear" \
+     --spectrum_type="fbank" \
      --feat_dim=161 \
      --delta_delta=false \
      --sample_rate=16000 \
      --stride_ms=10 \
-     --window_ms=20 \
-     --use_dB_normalization=True \
+     --window_ms=25 \
+     --use_dB_normalization=False \
      --num_workers=${num_workers} \
      --output_path="data/mean_std.json"
...
examples/librispeech/asr0/local/export.sh    (view file @ 919c8d06)

Same change as examples/aishell/asr0/local/export.sh above: the argument check drops the fourth model_type parameter (usage becomes "$0 config_path ckpt_prefix jit_model_path"), model_type=$4 is deleted, and the trailing --model_type ${model_type} flag is removed from the ${BIN_DIR}/export.py invocation (hunk @@ -11,14 +11,12 @@).
examples/librispeech/asr0/local/test.sh    (view file @ 919c8d06)

#!/bin/bash

- if [ $# != 4 ];then
-     echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
+ if [ $# != 3 ];then
+     echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
      exit -1
  fi

+ stage=0
+ stop_stage=100

  ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
  echo "using $ngpu gpus..."
...
@@ -11,7 +13,6 @@ echo "using $ngpu gpus..."
  config_path=$1
  decode_config_path=$2
  ckpt_prefix=$3
- model_type=$4

  # download language model
  bash local/download_lm_en.sh
...
@@ -19,17 +20,43 @@ if [ $? -ne 0 ]; then
      exit 1
  fi

- python3 -u ${BIN_DIR}/test.py \
- --ngpu ${ngpu} \
- --config ${config_path} \
- --decode_cfg ${decode_config_path} \
- --result_file ${ckpt_prefix}.rsl \
- --checkpoint_path ${ckpt_prefix} \
- --model_type ${model_type}
+ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+     # format the reference test file
+     python3 utils/format_rsl.py \
+         --origin_ref data/manifest.test-clean.raw \
+         --trans_ref data/manifest.test-clean.text
+
+     python3 -u ${BIN_DIR}/test.py \
+     --ngpu ${ngpu} \
+     --config ${config_path} \
+     --decode_cfg ${decode_config_path} \
+     --result_file ${ckpt_prefix}.rsl \
+     --checkpoint_path ${ckpt_prefix}
+
+     if [ $? -ne 0 ]; then
+         echo "Failed in evaluation!"
+         exit 1
+     fi
+
+     python3 utils/format_rsl.py \
+         --origin_hyp ${ckpt_prefix}.rsl \
+         --trans_hyp ${ckpt_prefix}.rsl.text
+
+     python3 utils/compute-wer.py --char=1 --v=1 \
+         data/manifest.test-clean.text ${ckpt_prefix}.rsl.text > ${ckpt_prefix}.error
+ fi
+
+ if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
+     python3 utils/format_rsl.py \
+         --origin_ref data/manifest.test-clean.raw \
+         --trans_ref_sclite data/manifest.test.text-clean.sclite
+
+     python3 utils/format_rsl.py \
+         --origin_hyp ${ckpt_prefix}.rsl \
+         --trans_hyp_sclite ${ckpt_prefix}.rsl.text.sclite
+
+     mkdir -p ${ckpt_prefix}_sclite
+     sclite -i wsj -r data/manifest.test-clean.text.sclite -h ${ckpt_prefix}.rsl.text.sclite -e utf-8 -o all -O ${ckpt_prefix}_sclite -c NOASCII
+ fi
...
examples/librispeech/asr0/local/test_wav.sh    (view file @ 919c8d06)

Same change as examples/aishell/asr0/local/test_wav.sh above: the script now takes four arguments (config_path decode_config_path ckpt_path_prefix audio_file), model_type=$4 is removed with audio_file shifting from $5 to $4, and --model_type ${model_type} is dropped from the ${BIN_DIR}/test_wav.py call. The demo wav fetched here is https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav.
examples/librispeech/asr0/local/train.sh    (view file @ 919c8d06)

Same change as examples/aishell/asr0/local/train.sh above: usage drops the model_type argument (now "CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"), model_type=$3 is deleted, and --model_type ${model_type} is removed from both the single-process and the paddle.distributed.launch invocations of ${BIN_DIR}/train.py.
examples/librispeech/asr0/run.sh    (view file @ 919c8d06)

@@ -2,13 +2,12 @@
  set -e
  source path.sh

- gpus=0,1,2,3,4,5,6,7
+ gpus=0,1,2,3
  stage=0
  stop_stage=100
  conf_path=conf/deepspeech2.yaml
  decode_conf_path=conf/tuning/decode.yaml
- avg_num=30
- model_type=offline
+ avg_num=5
  audio_file=data/demo_002_en.wav

  source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
...
@@ -24,7 +23,7 @@ fi
  if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
      # train model, all `ckpt` under `exp` dir
-     CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type}
+     CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
  fi

  if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
...
@@ -34,15 +33,20 @@ fi
  if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
      # test ckpt avg_n
-     CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
+     CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
  fi

  if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
      # export ckpt avg_n
-     CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
+     CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
  fi

+ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+     # test export ckpt avg_n
+     CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit|| exit -1
+ fi
+
  if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
      # test a single .wav file
-     CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
+     CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
  fi
examples/tiny/asr0/conf/augmentation.json    (deleted, 100644 → 0; view file @ 8b1c1ec4)

The whole file is removed; same structure as the aishell file above, except the specaug block used W: 5, F: 30 and T: 40 (the speed and shift policies are identical).
examples/tiny/asr0/conf/deepspeech2.yaml    (view file @ 919c8d06)

@@ -16,28 +16,26 @@ max_output_input_ratio: 10.0
Dataloader: restructured the same way as the aishell config above. The old in-config feature options (mean_std_filepath, unit_type: char, augmentation_config, random_seed, spectrum_type: linear, delta_delta, window_ms: 20.0, n_fft, max_freq, target_sample_rate, use_dB_normalization, target_dB, dither, keep_transcription_text, sortagrad: True, shuffle_method, num_workers: 2) are dropped in favour of spm_model_prefix: '', unit_type: 'char', preprocess_config: conf/preprocess.yaml, feat_dim: 161, window_ms: 25.0, sortagrad: 0, batch_size: 4, maxlen_in: 512, maxlen_out: 150, minibatches: 0, batch_count: auto, batch_bins/batch_frames_*: 0, num_workers: 8, subsampling_factor: 1, num_encs: 1.

############################################
# Network Architecture                     #
...
@@ -45,8 +43,10 @@ batch_size: 4
  num_conv_layers: 2
  num_rnn_layers: 3
  rnn_layer_size: 2048
+ rnn_direction: bidirect  # [forward, bidirect]
+ num_fc_layers: 0
+ fc_layers_size_list: -1,
  use_gru: False
  share_rnn_weights: True
  blank_id: 0
...
@@ -59,6 +59,7 @@ lr: 1.0e-5
  lr_decay: 0.8
  weight_decay: 1.0e-6
  global_grad_clip: 5.0
+ dist_sampler: False
  log_interval: 1
  checkpoint:
    kbest_n: 3
...
examples/tiny/asr0/conf/deepspeech2_online.yaml    (view file @ 919c8d06)

@@ -16,28 +16,26 @@ max_output_input_ratio: 10.0
Dataloader: same restructuring as the offline tiny config above (preprocess_config: conf/preprocess.yaml plus the maxlen/minibatch/batch_bins/batch_frames keys replace the old in-config feature options); window_ms goes from 20.0 to 25.0, sortagrad: True becomes sortagrad: 0, num_workers from 0 to 8, batch_size stays 4.

############################################
# Network Architecture                     #
...
@@ -61,6 +59,7 @@ lr: 1.0e-5
  lr_decay: 1.0
  weight_decay: 1.0e-6
  global_grad_clip: 5.0
+ dist_sampler: False
  log_interval: 1
  checkpoint:
    kbest_n: 3
...
examples/tiny/asr0/conf/preprocess.yaml    (new file, 0 → 100644; view file @ 919c8d06)

New file with the same content as examples/aishell/asr0/conf/preprocess.yaml above (fbank_kaldi extraction at fs 16000 with n_mels 161, cmvn_json with data/mean_std.json, and the SpecAugment time_warp / freq_mask / time_mask processes).
examples/tiny/asr0/local/export.sh    (view file @ 919c8d06)

Same change as examples/aishell/asr0/local/export.sh above: the argument check drops the fourth model_type parameter (usage becomes "$0 config_path ckpt_prefix jit_model_path"), model_type=$4 is deleted, and the trailing --model_type ${model_type} flag is removed from the ${BIN_DIR}/export.py invocation (hunk @@ -11,14 +11,12 @@).
examples/tiny/asr0/local/test.sh    (view file @ 919c8d06)

#!/bin/bash

- if [ $# != 4 ];then
-     echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
+ if [ $# != 3 ];then
+     echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
      exit -1
  fi
...
@@ -11,7 +11,6 @@ echo "using $ngpu gpus..."
  config_path=$1
  decode_config_path=$2
  ckpt_prefix=$3
- model_type=$4

  # download language model
  bash local/download_lm_en.sh
...
@@ -24,8 +23,7 @@ python3 -u ${BIN_DIR}/test.py \
  --config ${config_path} \
  --decode_cfg ${decode_config_path} \
  --result_file ${ckpt_prefix}.rsl \
- --checkpoint_path ${ckpt_prefix} \
- --model_type ${model_type}
+ --checkpoint_path ${ckpt_prefix}

  if [ $? -ne 0 ]; then
      echo "Failed in evaluation!"
...
examples/tiny/asr0/local/train.sh    (view file @ 919c8d06)

...
@@ -15,14 +15,13 @@ if [ ${seed} != 0 ]; then
      echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
  fi

- if [ $# != 3 ];then
-     echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type"
+ if [ $# != 2 ];then
+     echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
      exit -1
  fi

  config_path=$1
  ckpt_name=$2
- model_type=$3

  mkdir -p exp
...
@@ -31,7 +30,6 @@ python3 -u ${BIN_DIR}/train.py \
  --ngpu ${ngpu} \
  --config ${config_path} \
  --output exp/${ckpt_name} \
- --model_type ${model_type} \
  --profiler-options "${profiler_options}" \
  --seed ${seed}
  else
...
@@ -39,7 +37,6 @@ python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/t
  --ngpu ${ngpu} \
  --config ${config_path} \
  --output exp/${ckpt_name} \
- --model_type ${model_type} \
  --profiler-options "${profiler_options}" \
  --seed ${seed}
  fi
...
examples/tiny/asr0/run.sh    (view file @ 919c8d06)

@@ -8,8 +8,6 @@ stop_stage=100
  conf_path=conf/deepspeech2.yaml
  decode_conf_path=conf/tuning/decode.yaml
  avg_num=1
- model_type=offline

  source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

  avg_ckpt=avg_${avg_num}
...
@@ -23,7 +21,7 @@ fi
  if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
      # train model, all `ckpt` under `exp` dir
-     CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type}
+     CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
  fi

  if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
...
@@ -33,10 +31,10 @@ fi
  if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
      # test ckpt avg_n
-     CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
+     CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
  fi

  if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
      # export ckpt avg_n
-     CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
+     CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
  fi
paddlespeech/cli/asr/infer.py    (view file @ 919c8d06)

@@ -138,6 +138,7 @@ class ASRExecutor(BaseExecutor):
      tag = model_type + '-' + lang + '-' + sample_rate_str
      self.task_resource.set_task_model(tag, version=None)
+     self.res_path = self.task_resource.res_dir
      self.cfg_path = os.path.join(
          self.res_path, self.task_resource.res_dict['cfg_path'])
      self.ckpt_path = os.path.join(
...
@@ -158,15 +159,18 @@ class ASRExecutor(BaseExecutor):
      self.config.merge_from_file(self.cfg_path)
      with UpdateConfig(self.config):
-         if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
-             from paddlespeech.s2t.io.collator import SpeechCollator
-             self.vocab = self.config.vocab_filepath
+         if self.config.spm_model_prefix:
+             self.config.spm_model_prefix = os.path.join(
+                 self.res_path, self.config.spm_model_prefix)
+         self.text_feature = TextFeaturizer(
+             unit_type=self.config.unit_type,
+             vocab=self.config.vocab_filepath,
+             spm_model_prefix=self.config.spm_model_prefix)
+         if "deepspeech2" in model_type:
              self.config.decode.lang_model_path = os.path.join(
                  MODEL_HOME, 'language_model',
                  self.config.decode.lang_model_path)
-             self.collate_fn_test = SpeechCollator.from_config(self.config)
-             self.text_feature = TextFeaturizer(
-                 unit_type=self.config.unit_type, vocab=self.vocab)
              lm_url = self.task_resource.res_dict['lm_url']
              lm_md5 = self.task_resource.res_dict['lm_md5']
              self.download_lm(
...
@@ -174,12 +178,6 @@ class ASRExecutor(BaseExecutor):
                  os.path.dirname(self.config.decode.lang_model_path),
                  lm_md5)
          elif "conformer" in model_type or "transformer" in model_type:
-             self.config.spm_model_prefix = os.path.join(
-                 self.res_path, self.config.spm_model_prefix)
-             self.text_feature = TextFeaturizer(
-                 unit_type=self.config.unit_type,
-                 vocab=self.config.vocab_filepath,
-                 spm_model_prefix=self.config.spm_model_prefix)
              self.config.decode.decoding_method = decode_method
          else:
...
@@ -222,19 +220,7 @@ class ASRExecutor(BaseExecutor):
      logger.info("Preprocess audio_file:" + audio_file)

      # Get the object for feature extraction
-     if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
-         audio, _ = self.collate_fn_test.process_utterance(
-             audio_file=audio_file, transcript=" ")
-         audio_len = audio.shape[0]
-         audio = paddle.to_tensor(audio, dtype='float32')
-         audio_len = paddle.to_tensor(audio_len)
-         audio = paddle.unsqueeze(audio, axis=0)
-         # vocab_list = collate_fn_test.vocab_list
-         self._inputs["audio"] = audio
-         self._inputs["audio_len"] = audio_len
-         logger.info(f"audio feat shape: {audio.shape}")
-     elif "conformer" in model_type or "transformer" in model_type:
+     if "deepspeech2" in model_type or "conformer" in model_type or "transformer" in model_type:
          logger.info("get the preprocess conf")
          preprocess_conf = self.config.preprocess_config
          preprocess_args = {"train": False}
...
@@ -242,7 +228,6 @@ class ASRExecutor(BaseExecutor):
          logger.info("read the audio file")
          audio, audio_sample_rate = soundfile.read(
              audio_file, dtype="int16", always_2d=True)
          if self.change_format:
              if audio.shape[1] >= 2:
                  audio = audio.mean(axis=1, dtype=np.int16)
...
@@ -285,7 +270,7 @@ class ASRExecutor(BaseExecutor):
      cfg = self.config.decode
      audio = self._inputs["audio"]
      audio_len = self._inputs["audio_len"]
-     if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
+     if "deepspeech2" in model_type:
          decode_batch_size = audio.shape[0]
          self.model.decoder.init_decoder(
              decode_batch_size, self.text_feature.vocab_list,
...
paddlespeech/resource/model_alias.py    (view file @ 919c8d06)

@@ -23,7 +23,7 @@ model_alias = {
      # ---------------------------------
      "deepspeech2offline": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
      "deepspeech2online":
-     ["paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline"],
+     ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
      "conformer": ["paddlespeech.s2t.models.u2:U2Model"],
      "conformer_online": ["paddlespeech.s2t.models.u2:U2Model"],
      "transformer": ["paddlespeech.s2t.models.u2:U2Model"],
...
paddlespeech/resource/pretrained_models.py    (view file @ 919c8d06)

@@ -136,9 +136,9 @@ asr_dynamic_pretrained_models = {
  "deepspeech2online_wenetspeech-zh-16k", '1.0':
-     'url': 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz'
-     'md5': 'e393d4d274af0f6967db24fc146e8074'
+     'url': 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.1.model.tar.gz'
+     'md5': 'd1be86a3e786042ab64f05161b5fae62'
      'cfg_path': 'model.yaml'
      'ckpt_path': ...
@@ -152,13 +152,13 @@ asr_dynamic_pretrained_models = {
  "deepspeech2offline_aishell-zh-16k", '1.0':
-     'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz'
-     'md5': '932c3593d62fe5c741b59b31318aa314'
+     'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz'
+     'md5': '4d26066c6f19f52087425dc722ae5b13'
      'cfg_path': 'model.yaml'
-     'ckpt_path': 'exp/deepspeech2/checkpoints/avg_1'
+     'ckpt_path': 'exp/deepspeech2/checkpoints/avg_10'
      'lm_url': 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm'
      'lm_md5': ...
@@ -168,9 +168,9 @@ asr_dynamic_pretrained_models = {
  "deepspeech2online_aishell-zh-16k", '1.0':
-     'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz'
-     'md5': '98b87b171b7240b7cae6e07d8d0bc9be'
+     'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_1.0.1.model.tar.gz'
+     'md5': 'df5ddeac8b679a470176649ac4b78726'
      'cfg_path': 'model.yaml'
      'ckpt_path': ...
@@ -188,13 +188,13 @@ asr_dynamic_pretrained_models = {
  "deepspeech2offline_librispeech-en-16k", '1.0':
-     'url': 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz'
-     'md5': 'f5666c81ad015c8de03aac2bc92e5762'
+     'url': 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_offline_librispeech_ckpt_1.0.1.model.tar.gz'
+     'md5': 'ed9e2b008a65268b3484020281ab048c'
      'cfg_path': 'model.yaml'
-     'ckpt_path': 'exp/deepspeech2/checkpoints/avg_1'
+     'ckpt_path': 'exp/deepspeech2/checkpoints/avg_5'
      'lm_url': 'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm'
      'lm_md5': ...
@@ -207,17 +207,17 @@ asr_static_pretrained_models = {
  "deepspeech2offline_aishell-zh-16k", '1.0':
-     'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz'
-     'md5': '932c3593d62fe5c741b59b31318aa314'
+     'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz'
+     'md5': '4d26066c6f19f52087425dc722ae5b13'
      'cfg_path': 'model.yaml'
-     'ckpt_path': 'exp/deepspeech2/checkpoints/avg_1'
-     'model': 'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel'
-     'params': 'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams'
+     'ckpt_path': 'exp/deepspeech2/checkpoints/avg_10'
+     'model': 'exp/deepspeech2/checkpoints/avg_10.jit.pdmodel'
+     'params': 'exp/deepspeech2/checkpoints/avg_10.jit.pdiparams'
      'lm_url': 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm'
      'lm_md5': ...
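For orientation, a minimal sketch of exercising one of the refreshed model tags through the paddlespeech command-line client; the wav path is illustrative, and the exact flag set should be checked against the installed release:

    # Chinese streaming-trained DeepSpeech2 model on a 16 kHz mono wav
    paddlespeech asr --model deepspeech2online_aishell --lang zh --sr 16000 --input ./zh.wav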
paddlespeech/s2t/exps/deepspeech2/bin/export.py    (view file @ 919c8d06)

@@ -35,8 +35,6 @@ if __name__ == "__main__":
      # save jit model to
      parser.add_argument(
          "--export_path", type=str, help="path of the jit model to save")
-     parser.add_argument(
-         "--model_type", type=str, default='offline', help="offline/online")
      parser.add_argument(
          '--nxpu',
          type=int,
...
@@ -44,7 +42,6 @@ if __name__ == "__main__":
          choices=[0, 1],
          help="if nxpu == 0 and ngpu == 0, use cpu.")
      args = parser.parse_args()
-     print("model_type:{}".format(args.model_type))
      print_arguments(args)
      # https://yaml.org/type/float.html
...
paddlespeech/s2t/exps/deepspeech2/bin/test.py    (view file @ 919c8d06)

@@ -32,8 +32,6 @@ def main(config, args):
  if __name__ == "__main__":
      parser = default_argument_parser()
-     parser.add_argument(
-         "--model_type", type=str, default='offline', help='offline/online')
      # save asr result to
      parser.add_argument(
          "--result_file", type=str, help="path of save the asr result")
...
@@ -45,7 +43,6 @@ if __name__ == "__main__":
          help="if nxpu == 0 and ngpu == 0, use cpu.")
      args = parser.parse_args()
      print_arguments(args, globals())
-     print("model_type:{}".format(args.model_type))
      # https://yaml.org/type/float.html
      config = CfgNode(new_allowed=True)
...
paddlespeech/s2t/exps/deepspeech2/bin/test_export.py    (view file @ 919c8d06)

@@ -38,8 +38,6 @@ if __name__ == "__main__":
      #load jit model from
      parser.add_argument(
          "--export_path", type=str, help="path of the jit model to save")
-     parser.add_argument(
-         "--model_type", type=str, default='offline', help='offline/online')
      parser.add_argument(
          '--nxpu',
          type=int,
...
@@ -50,7 +48,6 @@ if __name__ == "__main__":
          "--enable-auto-log", action="store_true", help="use auto log")
      args = parser.parse_args()
      print_arguments(args, globals())
-     print("model_type:{}".format(args.model_type))
      # https://yaml.org/type/float.html
      config = CfgNode(new_allowed=True)
...
paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py    (view file @ 919c8d06)

@@ -23,7 +23,6 @@ from yacs.config import CfgNode
  from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
  from paddlespeech.s2t.io.collator import SpeechCollator
  from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
- from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
  from paddlespeech.s2t.training.cli import default_argument_parser
  from paddlespeech.s2t.utils import mp_tools
  from paddlespeech.s2t.utils.checkpoint import Checkpoint
...
@@ -113,12 +112,7 @@ class DeepSpeech2Tester_hub():
      config.input_dim = self.collate_fn_test.feature_size
      config.output_dim = self.collate_fn_test.vocab_size
-     if self.args.model_type == 'offline':
-         model = DeepSpeech2Model.from_config(config)
-     elif self.args.model_type == 'online':
-         model = DeepSpeech2ModelOnline.from_config(config)
-     else:
-         raise Exception("wrong model type")
+     model = DeepSpeech2Model.from_config(config)
      self.model = model
...
@@ -172,8 +166,6 @@ def main(config, args):
  if __name__ == "__main__":
      parser = default_argument_parser()
-     parser.add_argument(
-         "--model_type", type=str, default='offline', help='offline/online')
      parser.add_argument("--audio_file", type=str, help='audio file path')
      # save asr result to
      parser.add_argument(
...
@@ -184,7 +176,6 @@ if __name__ == "__main__":
          print("Please input the audio file path")
          sys.exit(-1)
      check(args.audio_file)
-     print("model_type:{}".format(args.model_type))
      # https://yaml.org/type/float.html
      config = CfgNode(new_allowed=True)
...
paddlespeech/s2t/exps/deepspeech2/bin/train.py    (view file @ 919c8d06)

@@ -31,8 +31,6 @@ def main(config, args):
  if __name__ == "__main__":
      parser = default_argument_parser()
-     parser.add_argument(
-         "--model_type", type=str, default='offline', help='offline/online')
      parser.add_argument(
          '--nxpu',
          type=int,
...
@@ -40,7 +38,6 @@ if __name__ == "__main__":
          choices=[0, 1],
          help="if nxpu == 0 and ngpu == 0, use cpu.")
      args = parser.parse_args()
-     print("model_type:{}".format(args.model_type))
      print_arguments(args, globals())
      # https://yaml.org/type/float.html
...
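Taken together with the recipe changes above, the trainer entry point no longer accepts --model_type. A minimal single-GPU sketch of the new invocation, assuming it is launched from examples/aishell/asr0 with path.sh sourced so that BIN_DIR points at paddlespeech/s2t/exps/deepspeech2/bin (the output directory name is illustrative):

    CUDA_VISIBLE_DEVICES=0 python3 -u ${BIN_DIR}/train.py \
        --ngpu 1 \
        --config conf/deepspeech2.yaml \
        --output exp/deepspeech2 \
        --seed 0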
paddlespeech/s2t/exps/deepspeech2/model.py
@@ -23,16 +23,12 @@ import paddle
from paddle import distributed as dist
from paddle import inference
from paddle.io import DataLoader
from paddlespeech.s2t.io.dataloader import BatchDataLoader
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.io.collator import SpeechCollator
from paddlespeech.s2t.io.dataset import ManifestDataset
from paddlespeech.s2t.io.sampler import SortagradBatchSampler
from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline
from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
from paddlespeech.s2t.training.reporter import report
from paddlespeech.s2t.training.timer import Timer
...
@@ -136,18 +132,13 @@ class DeepSpeech2Trainer(Trainer):
        config = self.config.clone()
        with UpdateConfig(config):
            if self.train:
                config.input_dim = self.train_loader.collate_fn.feature_size
                config.output_dim = self.train_loader.collate_fn.vocab_size
                config.input_dim = self.train_loader.feat_dim
                config.output_dim = self.train_loader.vocab_size
            else:
                config.input_dim = self.test_loader.collate_fn.feature_size
                config.output_dim = self.test_loader.collate_fn.vocab_size
                config.input_dim = self.test_loader.feat_dim
                config.output_dim = self.test_loader.vocab_size

        if self.args.model_type == 'offline':
            model = DeepSpeech2Model.from_config(config)
        elif self.args.model_type == 'online':
            model = DeepSpeech2ModelOnline.from_config(config)
        else:
            raise Exception("wrong model type")

        if self.parallel:
            model = paddle.DataParallel(model)
...
@@ -175,76 +166,81 @@ class DeepSpeech2Trainer(Trainer):
        config = self.config.clone()
        config.defrost()
        if self.train:
            # train
            config.manifest = config.train_manifest
            train_dataset = ManifestDataset.from_config(config)
            if self.parallel:
                batch_sampler = SortagradDistributedBatchSampler(
                    train_dataset,
                    batch_size=config.batch_size,
                    num_replicas=None,
                    rank=None,
                    shuffle=True,
                    drop_last=True,
            # train/valid dataset, return token ids
            self.train_loader = BatchDataLoader(
                json_file=config.train_manifest,
                train_mode=True,
                    sortagrad=config.sortagrad,
                    shuffle_method=config.shuffle_method)
            else:
                batch_sampler = SortagradBatchSampler(
                    train_dataset,
                    shuffle=True,
                    batch_size=config.batch_size,
                    drop_last=True,
                    sortagrad=config.sortagrad,
                    shuffle_method=config.shuffle_method)
            config.keep_transcription_text = False
            collate_fn_train = SpeechCollator.from_config(config)
            self.train_loader = DataLoader(
                train_dataset,
                batch_sampler=batch_sampler,
                collate_fn=collate_fn_train,
                num_workers=config.num_workers)
            # dev
            config.manifest = config.dev_manifest
            dev_dataset = ManifestDataset.from_config(config)
            config.augmentation_config = ""
            config.keep_transcription_text = False
            collate_fn_dev = SpeechCollator.from_config(config)
            self.valid_loader = DataLoader(
                dev_dataset,
                batch_size=int(config.batch_size),
                shuffle=False,
                drop_last=False,
                collate_fn=collate_fn_dev,
                num_workers=config.num_workers)
                maxlen_in=config.maxlen_in,
                maxlen_out=config.maxlen_out,
                minibatches=config.minibatches,
                mini_batch_size=self.args.ngpu,
                batch_count=config.batch_count,
                batch_bins=config.batch_bins,
                batch_frames_in=config.batch_frames_in,
                batch_frames_out=config.batch_frames_out,
                batch_frames_inout=config.batch_frames_inout,
                preprocess_conf=config.preprocess_config,
                n_iter_processes=config.num_workers,
                subsampling_factor=1,
                num_encs=1,
                dist_sampler=config.get('dist_sampler', False),
                shortest_first=False)
            self.valid_loader = BatchDataLoader(
                json_file=config.dev_manifest,
                train_mode=False,
                sortagrad=False,
                batch_size=config.batch_size,
                maxlen_in=float('inf'),
                maxlen_out=float('inf'),
                minibatches=0,
                mini_batch_size=self.args.ngpu,
                batch_count='auto',
                batch_bins=0,
                batch_frames_in=0,
                batch_frames_out=0,
                batch_frames_inout=0,
                preprocess_conf=config.preprocess_config,
                n_iter_processes=config.num_workers,
                subsampling_factor=1,
                num_encs=1,
                dist_sampler=config.get('dist_sampler', False),
                shortest_first=False)
            logger.info("Setup train/valid Dataloader!")
        else:
            # test
            config.manifest = config.test_manifest
            test_dataset = ManifestDataset.from_config(config)
            config.augmentation_config = ""
            config.keep_transcription_text = True
            collate_fn_test = SpeechCollator.from_config(config)
            decode_batch_size = config.get('decode', dict()).get(
                'decode_batch_size', 1)
            self.test_loader = DataLoader(
                test_dataset,
            # test dataset, return raw text
            self.test_loader = BatchDataLoader(
                json_file=config.test_manifest,
                train_mode=False,
                sortagrad=False,
                batch_size=decode_batch_size,
                shuffle=False,
                drop_last=False,
                collate_fn=collate_fn_test,
                num_workers=config.num_workers)
            logger.info("Setup test Dataloader!")
                maxlen_in=float('inf'),
                maxlen_out=float('inf'),
                minibatches=0,
                mini_batch_size=1,
                batch_count='auto',
                batch_bins=0,
                batch_frames_in=0,
                batch_frames_out=0,
                batch_frames_inout=0,
                preprocess_conf=config.preprocess_config,
                n_iter_processes=1,
                subsampling_factor=1,
                num_encs=1)
            logger.info("Setup test/align Dataloader!")


class DeepSpeech2Tester(DeepSpeech2Trainer):
    def __init__(self, config, args):
        super().__init__(config, args)
        self._text_featurizer = TextFeaturizer(
            unit_type=config.unit_type, vocab=None)
            unit_type=config.unit_type, vocab=config.vocab_filepath)
        self.vocab_list = self._text_featurizer.vocab_list

    def ordid2token(self, texts, texts_len):
        """ ord() id to chr() chr """
...
@@ -252,7 +248,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
        for text, n in zip(texts, texts_len):
            n = n.numpy().item()
            ids = text[:n]
            trans.append(''.join([chr(i) for i in ids]))
            trans.append(
                self._text_featurizer.defeaturize(ids.numpy().tolist()))
        return trans

    def compute_metrics(self,
...
@@ -307,8 +303,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
        # Initialized the decoder in model
        decode_cfg = self.config.decode
        vocab_list = self.test_loader.collate_fn.vocab_list
        decode_batch_size = self.test_loader.batch_size
        vocab_list = self.vocab_list
        decode_batch_size = decode_cfg.decode_batch_size
        self.model.decoder.init_decoder(
            decode_batch_size, vocab_list, decode_cfg.decoding_method,
            decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta,
...
@@ -338,17 +334,9 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
    @paddle.no_grad()
    def export(self):
        if self.args.model_type == 'offline':
            infer_model = DeepSpeech2InferModel.from_pretrained(
                self.test_loader, self.config, self.args.checkpoint_path)
        elif self.args.model_type == 'online':
            infer_model = DeepSpeech2InferModelOnline.from_pretrained(
                self.test_loader, self.config, self.args.checkpoint_path)
        else:
            raise Exception("wrong model type")
        infer_model.eval()
        feat_dim = self.test_loader.collate_fn.feature_size
        static_model = infer_model.export()
        logger.info(f"Export code: {static_model.forward.code}")
        paddle.jit.save(static_model, self.args.export_path)
...
@@ -376,10 +364,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
        # Initialized the decoder in model
        decode_cfg = self.config.decode
        vocab_list = self.test_loader.collate_fn.vocab_list
        if self.args.model_type == "online":
        vocab_list = self.vocab_list
        if self.config.rnn_direction == "forward":
            decode_batch_size = 1
        elif self.args.model_type == "offline":
        elif self.config.rnn_direction == "bidirect":
            decode_batch_size = self.test_loader.batch_size
        else:
            raise Exception("wrong model type")
...
@@ -412,11 +400,11 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
        self.model.decoder.del_decoder()

    def compute_result_transcripts(self, audio, audio_len):
        if self.args.model_type == "online":
        if self.config.rnn_direction == "forward":
            output_probs, output_lens, trans_batch = self.static_forward_online(
                audio, audio_len, decoder_chunk_size=1)
            result_transcripts = [trans[-1] for trans in trans_batch]
        elif self.args.model_type == "offline":
        elif self.config.rnn_direction == "bidirect":
            output_probs, output_lens = self.static_forward_offline(audio,
                                                                     audio_len)
        batch_size = output_probs.shape[0]
...
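The tester's new detokenization path indexes ids into the vocabulary (via TextFeaturizer) instead of treating them as raw code points for chr(). A minimal sketch of the difference, using an assumed toy vocabulary rather than the project's real vocab file:

    # Toy illustration of the old vs. new id-to-text mapping; vocab_list is made up.
    vocab_list = ['<blank>', 'h', 'e', 'l', 'o']

    def old_style(ids):
        # old path: token ids were interpreted as unicode code points
        return ''.join([chr(i) for i in ids])

    def new_style(ids, vocab_list):
        # new path: ids index into the vocabulary, mirroring TextFeaturizer.defeaturize
        return ''.join([vocab_list[i] for i in ids])

    print(new_style([1, 2, 3, 3, 4], vocab_list))  # -> "hello"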
paddlespeech/s2t/models/ds2/conv.py
@@ -11,161 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import nn
from paddle.nn import functional as F
import paddle

from paddlespeech.s2t.modules.activation import brelu
from paddlespeech.s2t.modules.mask import make_non_pad_mask
from paddlespeech.s2t.utils.log import Log
from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4

logger = Log(__name__).getlog()

__all__ = ['ConvStack', "conv_output_size"]


class Conv2dSubsampling4Pure(Conv2dSubsampling4):
    def __init__(self, idim: int, odim: int, dropout_rate: float):
        super().__init__(idim, odim, dropout_rate, None)
        self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim
        self.receptive_field_length = 2 * (
            3 - 1) + 3  # stride_1 * (kernel_size_2 - 1) + kerel_size_1


def conv_output_size(I, F, P, S):
    # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
    # Output size after Conv:
    #   By noting I the length of the input volume size,
    #   F the length of the filter,
    #   P the amount of zero padding,
    #   S the stride,
    #   then the output size O of the feature map along that dimension is given by:
    #       O = (I - F + Pstart + Pend) // S + 1
    #   When Pstart == Pend == P, we can replace Pstart + Pend by 2P.
    #   When Pstart == Pend == 0
    #       O = (I - F - S) // S
    # https://iq.opengenus.org/output-size-of-convolution/
    # Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1
    # Output width = (Output width + padding width right + padding width left - kernel width) / (stride width) + 1
    return (I - F + 2 * P - S) // S


# receptive field calculator
# https://fomoro.com/research/article/receptive-field-calculator
# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
# https://distill.pub/2019/computing-receptive-fields/
# Rl-1 = Sl * Rl + (Kl - Sl)


class ConvBn(nn.Layer):
    """Convolution layer with batch normalization.

    :param kernel_size: The x dimension of a filter kernel. Or input a tuple for
                        two image dimension.
    :type kernel_size: int|tuple|list
    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels.
    :type num_channels_out: int
    :param stride: The x dimension of the stride. Or input a tuple for two
                   image dimension.
    :type stride: int|tuple|list
    :param padding: The x dimension of the padding. Or input a tuple for two
                    image dimension.
    :type padding: int|tuple|list
    :param act: Activation type, relu|brelu
    :type act: string
    :return: Batch norm layer after convolution layer.
    :rtype: Variable
    """

    def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,
                 padding, act):
        super().__init__()
        assert len(kernel_size) == 2
        assert len(stride) == 2
        assert len(padding) == 2
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        self.conv = nn.Conv2D(
            num_channels_in,
            num_channels_out,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            weight_attr=None,
            bias_attr=False,
            data_format='NCHW')

        self.bn = nn.BatchNorm2D(
            num_channels_out,
            weight_attr=None,
            bias_attr=None,
            data_format='NCHW')
        self.act = F.relu if act == 'relu' else brelu

    def forward(self, x, x_len):
        """
        x(Tensor): audio, shape [B, C, D, T]
        """
    def forward(self, x: paddle.Tensor,
                x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:
        x = x.unsqueeze(1)  # (b, c=1, t, f)
        x = self.conv(x)
        x = self.bn(x)
        x = self.act(x)

        x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]
                 ) // self.stride[1] + 1

        # reset padding part to 0
        masks = make_non_pad_mask(x_len)  #[B, T]
        masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
        # TODO(Hui Zhang): not support bool multiply
        # masks = masks.type_as(x)
        masks = masks.astype(x.dtype)
        x = x.multiply(masks)
        return x, x_len


class ConvStack(nn.Layer):
    """Convolution group with stacked convolution layers.

    :param feat_size: audio feature dim.
    :type feat_size: int
    :param num_stacks: Number of stacked convolution layers.
    :type num_stacks: int
    """

    def __init__(self, feat_size, num_stacks):
        super().__init__()
        self.feat_size = feat_size  # D
        self.num_stacks = num_stacks

        self.conv_in = ConvBn(
            num_channels_in=1,
            num_channels_out=32,
            kernel_size=(41, 11),  #[D, T]
            stride=(2, 3),
            padding=(20, 5),
            act='brelu')

        out_channel = 32
        convs = [
            ConvBn(
                num_channels_in=32,
                num_channels_out=out_channel,
                kernel_size=(21, 11),
                stride=(2, 1),
                padding=(10, 5),
                act='brelu') for i in range(num_stacks - 1)
        ]
        self.conv_stack = nn.LayerList(convs)

        # conv output feat_dim
        output_height = (feat_size - 1) // 2 + 1
        for i in range(self.num_stacks - 1):
            output_height = (output_height - 1) // 2 + 1
        self.output_height = out_channel * output_height

    def forward(self, x, x_len):
        """
        x: shape [B, C, D, T]
        x_len : shape [B]
        """
        x, x_len = self.conv_in(x, x_len)
        for i, conv in enumerate(self.conv_stack):
            x, x_len = conv(x, x_len)
        #b, c, t, f = paddle.shape(x) #not work under jit
        x = x.transpose([0, 2, 1, 3]).reshape([0, 0, -1])
        x_len = ((x_len - 1) // 2 - 1) // 2
        return x, x_len
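The subsampling arithmetic above can be checked numerically. A small sketch, using the formulas from Conv2dSubsampling4Pure with an assumed 80-dimensional fbank input (80 and 32 are example values, not taken from this diff):

    # Two stride-2 convolutions along the feature axis, as in Conv2dSubsampling4Pure.
    idim, odim = 80, 32
    output_dim = ((idim - 1) // 2 - 1) // 2 * odim   # -> 19 * 32 = 608
    receptive_field = 2 * (3 - 1) + 3                # stride_1 * (kernel_size_2 - 1) + kernel_size_1 -> 7
    print(output_dim, receptive_field)               # 608 7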
paddlespeech/s2t/models/ds2/deepspeech2.py
@@ -13,15 +13,14 @@
# limitations under the License.
"""Deepspeech2 ASR Model"""
import paddle
import paddle.nn.functional as F
from paddle import nn

from paddlespeech.s2t.models.ds2.conv import ConvStack
from paddlespeech.s2t.models.ds2.rnn import RNNStack
from paddlespeech.s2t.models.ds2.conv import Conv2dSubsampling4Pure
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.utils import layer_tools
from paddlespeech.s2t.utils.checkpoint import Checkpoint
from paddlespeech.s2t.utils.log import Log

logger = Log(__name__).getlog()

__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']
...
@@ -32,72 +31,197 @@ class CRNNEncoder(nn.Layer):
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 num_rnn_layers=4,
                 rnn_size=1024,
                 use_gru=False,
                 share_rnn_weights=True):
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
                 use_gru=False):
        super().__init__()
        self.rnn_size = rnn_size
        self.feat_size = feat_size  # 161 for linear
        self.dict_size = dict_size

        self.conv = ConvStack(feat_size, num_conv_layers)

        i_size = self.conv.output_height  # H after conv stack
        self.rnn = RNNStack(
            i_size=i_size,
            h_size=rnn_size,
            num_stacks=num_rnn_layers,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights)
        self.num_rnn_layers = num_rnn_layers
        self.num_fc_layers = num_fc_layers
        self.rnn_direction = rnn_direction
        self.fc_layers_size_list = fc_layers_size_list
        self.use_gru = use_gru
        self.conv = Conv2dSubsampling4Pure(feat_size, 32, dropout_rate=0.0)

        self.output_dim = self.conv.output_dim

        i_size = self.conv.output_dim

        self.rnn = nn.LayerList()
        self.layernorm_list = nn.LayerList()
        self.fc_layers_list = nn.LayerList()
        if rnn_direction == 'bidirect' or rnn_direction == 'bidirectional':
            layernorm_size = 2 * rnn_size
        elif rnn_direction == 'forward':
            layernorm_size = rnn_size
        else:
            raise Exception("Wrong rnn direction")
        for i in range(0, num_rnn_layers):
            if i == 0:
                rnn_input_size = i_size
            else:
                rnn_input_size = layernorm_size
            if use_gru is True:
                self.rnn.append(
                    nn.GRU(
                        input_size=rnn_input_size,
                        hidden_size=rnn_size,
                        num_layers=1,
                        direction=rnn_direction))
            else:
                self.rnn.append(
                    nn.LSTM(
                        input_size=rnn_input_size,
                        hidden_size=rnn_size,
                        num_layers=1,
                        direction=rnn_direction))
            self.layernorm_list.append(nn.LayerNorm(layernorm_size))
            self.output_dim = layernorm_size

        fc_input_size = layernorm_size
        for i in range(self.num_fc_layers):
            self.fc_layers_list.append(
                nn.Linear(fc_input_size, fc_layers_size_list[i]))
            fc_input_size = fc_layers_size_list[i]
            self.output_dim = fc_layers_size_list[i]

    @property
    def output_size(self):
        return self.rnn_size * 2
        return self.output_dim

    def forward(self, audio, audio_len):
    def forward(self, x, x_lens, init_state_h_box=None, init_state_c_box=None):
        """Compute Encoder outputs

        Args:
            audio (Tensor): [B, Tmax, D]
            text (Tensor): [B, Umax]
            audio_len (Tensor): [B]
            text_len (Tensor): [B]
        Returns:
            x (Tensor): [B, T, D]
            x_lens (Tensor): [B]
            init_state_h_box(Tensor): init_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            init_state_c_box(Tensor): init_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
        Return:
            x (Tensor): encoder outputs, [B, T, D]
            x_lens (Tensor): encoder length, [B]
            final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
        """
        # [B, T, D] -> [B, D, T]
        audio = audio.transpose([0, 2, 1])
        # [B, D, T] -> [B, C=1, D, T]
        x = audio.unsqueeze(1)
        x_lens = audio_len
        if init_state_h_box is not None:
            init_state_list = None

            if self.use_gru is True:
                init_state_h_list = paddle.split(
                    init_state_h_box, self.num_rnn_layers, axis=0)
                init_state_list = init_state_h_list
            else:
                init_state_h_list = paddle.split(
                    init_state_h_box, self.num_rnn_layers, axis=0)
                init_state_c_list = paddle.split(
                    init_state_c_box, self.num_rnn_layers, axis=0)
                init_state_list = [(init_state_h_list[i], init_state_c_list[i])
                                   for i in range(self.num_rnn_layers)]
        else:
            init_state_list = [None] * self.num_rnn_layers

        # convolution group
        x, x_lens = self.conv(x, x_lens)
        final_chunk_state_list = []
        for i in range(0, self.num_rnn_layers):
            x, final_state = self.rnn[i](x, init_state_list[i],
                                         x_lens)  #[B, T, D]
            final_chunk_state_list.append(final_state)
            x = self.layernorm_list[i](x)

        for i in range(self.num_fc_layers):
            x = self.fc_layers_list[i](x)
            x = F.relu(x)

        if self.use_gru is True:
            final_chunk_state_h_box = paddle.concat(
                final_chunk_state_list, axis=0)
            final_chunk_state_c_box = init_state_c_box
        else:
            final_chunk_state_h_list = [
                final_chunk_state_list[i][0] for i in range(self.num_rnn_layers)
            ]
            final_chunk_state_c_list = [
                final_chunk_state_list[i][1] for i in range(self.num_rnn_layers)
            ]
            final_chunk_state_h_box = paddle.concat(
                final_chunk_state_h_list, axis=0)
            final_chunk_state_c_box = paddle.concat(
                final_chunk_state_c_list, axis=0)

        return x, x_lens, final_chunk_state_h_box, final_chunk_state_c_box

    def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8):
        """Compute Encoder outputs

        # convert data from convolution feature map to sequence of vectors
        #B, C, D, T = paddle.shape(x)  # not work under jit
        x = x.transpose([0, 3, 1, 2])  #[B, T, C, D]
        #x = x.reshape([B, T, C * D])  #[B, T, C*D]  # not work under jit
        x = x.reshape([0, 0, -1])  #[B, T, C*D]

        # remove padding part
        x, x_lens = self.rnn(x, x_lens)  #[B, T, D]
        return x, x_lens
        Args:
            x (Tensor): [B, T, D]
            x_lens (Tensor): [B]
            decoder_chunk_size: The chunk size of decoder
        Returns:
            eouts_list (List of Tensor): The list of encoder outputs in chunk_size: [B, chunk_size, D] * num_chunks
            eouts_lens_list (List of Tensor): The list of encoder length in chunk_size: [B] * num_chunks
            final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
        """
        subsampling_rate = self.conv.subsampling_rate
        receptive_field_length = self.conv.receptive_field_length
        chunk_size = (decoder_chunk_size - 1
                      ) * subsampling_rate + receptive_field_length
        chunk_stride = subsampling_rate * decoder_chunk_size
        max_len = x.shape[1]
        assert (chunk_size <= max_len)

        eouts_chunk_list = []
        eouts_chunk_lens_list = []
        if (max_len - chunk_size) % chunk_stride != 0:
            padding_len = chunk_stride - (max_len - chunk_size) % chunk_stride
        else:
            padding_len = 0
        padding = paddle.zeros((x.shape[0], padding_len, x.shape[2]))
        padded_x = paddle.concat([x, padding], axis=1)
        num_chunk = (max_len + padding_len - chunk_size) / chunk_stride + 1
        num_chunk = int(num_chunk)
        chunk_state_h_box = None
        chunk_state_c_box = None
        final_state_h_box = None
        final_state_c_box = None
        for i in range(0, num_chunk):
            start = i * chunk_stride
            end = start + chunk_size
            x_chunk = padded_x[:, start:end, :]

            x_len_left = paddle.where(x_lens - i * chunk_stride < 0,
                                      paddle.zeros_like(x_lens),
                                      x_lens - i * chunk_stride)
            x_chunk_len_tmp = paddle.ones_like(x_lens) * chunk_size
            x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp,
                                        x_len_left, x_chunk_len_tmp)

            eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward(
                x_chunk, x_chunk_lens, chunk_state_h_box, chunk_state_c_box)

            eouts_chunk_list.append(eouts_chunk)
            eouts_chunk_lens_list.append(eouts_chunk_lens)
        final_state_h_box = chunk_state_h_box
        final_state_c_box = chunk_state_c_box
        return eouts_chunk_list, eouts_chunk_lens_list, final_state_h_box, final_state_c_box


class DeepSpeech2Model(nn.Layer):
    """The DeepSpeech2 network structure.

    :param audio_data: Audio spectrogram data layer.
    :type audio_data: Variable
    :param text_data: Transcription text data layer.
    :type text_data: Variable
    :param audio: Audio spectrogram data layer.
    :type audio: Variable
    :param text: Transcription text data layer.
    :type text: Variable
    :param audio_len: Valid sequence length data layer.
    :type audio_len: Variable
    :param masks: Masks data layer to reset padding.
    :type masks: Variable
    :param feat_size: feature size for audio.
    :type feat_size: int
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
...
@@ -106,37 +230,41 @@ class DeepSpeech2Model(nn.Layer):
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (dimension of RNN cells).
    :type rnn_size: int
    :param num_fc_layers: Number of stacking FC layers.
    :type num_fc_layers: int
    :param fc_layers_size_list: The list of FC layer sizes.
    :type fc_layers_size_list: [int,]
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward direction RNNs.
                              It is only available when use_gru=False.
    :type share_weights: bool
    :return: A tuple of an output unnormalized log probability layer (
             before softmax) and a ctc cost layer.
    :rtype: tuple of LayerOutput
    """

    def __init__(self,
    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 num_rnn_layers=4,
                 rnn_size=1024,
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
                 use_gru=False,
                 share_rnn_weights=True,
                 blank_id=0,
                 ctc_grad_norm_type=None):
                 ctc_grad_norm_type=None, ):
        super().__init__()
        self.encoder = CRNNEncoder(
            feat_size=feat_size,
            dict_size=dict_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_direction=rnn_direction,
            num_fc_layers=num_fc_layers,
            fc_layers_size_list=fc_layers_size_list,
            rnn_size=rnn_size,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights)
        assert (self.encoder.output_size == rnn_size * 2)
            use_gru=use_gru)

        self.decoder = CTCDecoder(
            odim=dict_size,  # <blank> is in vocab
...
@@ -151,7 +279,7 @@ class DeepSpeech2Model(nn.Layer):
        """Compute Model loss

        Args:
            audio (Tensors): [B, T, D]
            audio (Tensor): [B, T, D]
            audio_len (Tensor): [B]
            text (Tensor): [B, U]
            text_len (Tensor): [B]
...
@@ -159,22 +287,22 @@ class DeepSpeech2Model(nn.Layer):
        Returns:
            loss (Tensor): [1]
        """
        eouts, eouts_len = self.encoder(audio, audio_len)
        eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
            audio, audio_len, None, None)
        loss = self.decoder(eouts, eouts_len, text, text_len)
        return loss

    @paddle.no_grad()
    def decode(self, audio, audio_len):
        # decoders only accept string encoded in utf-8
        # Make sure the decoder has been initialized
        eouts, eouts_len = self.encoder(audio, audio_len)
        eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
            audio, audio_len, None, None)
        probs = self.decoder.softmax(eouts)
        batch_size = probs.shape[0]
        self.decoder.reset_decoder(batch_size=batch_size)
        self.decoder.next(probs, eouts_len)
        trans_best, trans_beam = self.decoder.decode()
        return trans_best

    @classmethod
...
@@ -196,13 +324,15 @@ class DeepSpeech2Model(nn.Layer):
            The model built from pretrained result.
        """
        model = cls(
            feat_size=dataloader.collate_fn.feature_size,
            dict_size=dataloader.collate_fn.vocab_size,
            feat_size=dataloader.feat_dim,
            dict_size=dataloader.vocab_size,
            num_conv_layers=config.num_conv_layers,
            num_rnn_layers=config.num_rnn_layers,
            rnn_size=config.rnn_layer_size,
            rnn_direction=config.rnn_direction,
            num_fc_layers=config.num_fc_layers,
            fc_layers_size_list=config.fc_layers_size_list,
            use_gru=config.use_gru,
            share_rnn_weights=config.share_rnn_weights,
            blank_id=config.blank_id,
            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
        infos = Checkpoint().load_parameters(
...
@@ -229,8 +359,10 @@ class DeepSpeech2Model(nn.Layer):
            num_conv_layers=config.num_conv_layers,
            num_rnn_layers=config.num_rnn_layers,
            rnn_size=config.rnn_layer_size,
            rnn_direction=config.rnn_direction,
            num_fc_layers=config.num_fc_layers,
            fc_layers_size_list=config.fc_layers_size_list,
            use_gru=config.use_gru,
            share_rnn_weights=config.share_rnn_weights,
            blank_id=config.blank_id,
            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
        return model
...
@@ -240,21 +372,37 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, audio, audio_len):
        """export model function

        Args:
            audio (Tensor): [B, T, D]
            audio_len (Tensor): [B]

        Returns:
            probs: probs after softmax
        """
        eouts, eouts_len = self.encoder(audio, audio_len)
    def forward(self,
                audio_chunk,
                audio_chunk_lens,
                chunk_state_h_box=None,
                chunk_state_c_box=None):
        if self.encoder.rnn_direction == "forward":
            eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder(
                audio_chunk, audio_chunk_lens, chunk_state_h_box,
                chunk_state_c_box)
            probs_chunk = self.decoder.softmax(eouts_chunk)
            return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box
        elif self.encoder.rnn_direction == "bidirect":
            eouts, eouts_len, _, _ = self.encoder(audio_chunk, audio_chunk_lens)
            probs = self.decoder.softmax(eouts)
            return probs, eouts_len
        else:
            raise Exception("wrong model type")

    def export(self):
        if self.encoder.rnn_direction == "forward":
            static_model = paddle.jit.to_static(
                self,
                input_spec=[
                    paddle.static.InputSpec(
                        shape=[None, None, self.encoder.feat_size],  #[B, chunk_size, feat_dim]
                        dtype='float32'),
                    paddle.static.InputSpec(shape=[None], dtype='int64'),  # audio_length, [B]
                    paddle.static.InputSpec(
                        shape=[None, None, None], dtype='float32'),
                    paddle.static.InputSpec(
                        shape=[None, None, None], dtype='float32')
                ])
        elif self.encoder.rnn_direction == "bidirect":
            static_model = paddle.jit.to_static(
                self,
                input_spec=[
...
@@ -264,4 +412,6 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
                    paddle.static.InputSpec(shape=[None], dtype='int64'),  # audio_length, [B]
                ])
        else:
            raise Exception("wrong model type")
        return static_model
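The chunking arithmetic in forward_chunk_by_chunk can be traced with concrete numbers. A sketch assuming the Conv2dSubsampling4Pure values (subsampling rate 4, receptive field 7) and an example utterance of 100 subsampled-input frames; the 100 is an illustrative value only:

    # Worked example of the chunk bookkeeping used by forward_chunk_by_chunk.
    decoder_chunk_size = 8
    subsampling_rate, receptive_field_length = 4, 7       # assumed from Conv2dSubsampling4Pure
    chunk_size = (decoder_chunk_size - 1) * subsampling_rate + receptive_field_length  # 35
    chunk_stride = subsampling_rate * decoder_chunk_size                               # 32
    max_len = 100                                          # example input length in frames
    padding_len = (chunk_stride - (max_len - chunk_size) % chunk_stride) % chunk_stride
    num_chunk = (max_len + padding_len - chunk_size) // chunk_stride + 1
    print(chunk_size, chunk_stride, padding_len, num_chunk)  # 35 32 31 4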
paddlespeech/s2t/models/ds2/rnn.py (deleted, 100644 → 0)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

from paddlespeech.s2t.modules.activation import brelu
from paddlespeech.s2t.modules.mask import make_non_pad_mask
from paddlespeech.s2t.utils.log import Log

logger = Log(__name__).getlog()

__all__ = ['RNNStack']


class RNNCell(nn.RNNCellBase):
    r"""
    Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
    computes the outputs and updates states.
    The formula used is as follows:
    .. math::
        h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
        y_{t} & = h_{t}
    where :math:`act` is for :attr:`activation`.
    """

    def __init__(self,
                 hidden_size: int,
                 activation="tanh",
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super().__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_hh = self.create_parameter(
            (hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = None
        self.bias_hh = self.create_parameter(
            (hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))
        self.hidden_size = hidden_size
        if activation not in ["tanh", "relu", "brelu"]:
            raise ValueError(
                "activation for SimpleRNNCell should be tanh or relu, "
                "but get {}".format(activation))
        self.activation = activation
        self._activation_fn = paddle.tanh \
            if activation == "tanh" \
            else F.relu
        if activation == 'brelu':
            self._activation_fn = brelu

    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)
        pre_h = states
        i2h = inputs
        if self.bias_ih is not None:
            i2h += self.bias_ih
        h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            h2h += self.bias_hh
        h = self._activation_fn(i2h + h2h)
        return h, h

    @property
    def state_shape(self):
        return (self.hidden_size, )


class GRUCell(nn.RNNCellBase):
    r"""
    Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
    it computes the outputs and updates states.
    The formula for GRU used is as follows:
    ..  math::
        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
        h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
        y_{t} & = h_{t}
    where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise
    multiplication operator.
    """

    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super().__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_hh = self.create_parameter(
            (3 * hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = None
        self.bias_hh = self.create_parameter(
            (3 * hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))
        self.hidden_size = hidden_size
        self.input_size = input_size
        self._gate_activation = F.sigmoid
        self._activation = paddle.tanh

    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)
        pre_hidden = states
        x_gates = inputs
        if self.bias_ih is not None:
            x_gates = x_gates + self.bias_ih
        h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            h_gates = h_gates + self.bias_hh

        x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1)
        h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1)

        r = self._gate_activation(x_r + h_r)
        z = self._gate_activation(x_z + h_z)
        c = self._activation(x_c + r * h_c)  # apply reset gate after mm
        h = (pre_hidden - c) * z + c
        # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru

        return h, h

    @property
    def state_shape(self):
        r"""
        The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
        size would be automatically inserted into shape). The shape corresponds
        to the shape of :math:`h_{t-1}`.
        """
        return (self.hidden_size, )


class BiRNNWithBN(nn.Layer):
    """Bidirectonal simple rnn layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param size: Dimension of RNN cells.
    :type size: int
    :param share_weights: Whether to share input-hidden weights between
                          forward and backward directional RNNs.
    :type share_weights: bool
    :return: Bidirectional simple rnn layer.
    :rtype: Variable
    """

    def __init__(self, i_size: int, h_size: int, share_weights: bool):
        super().__init__()
        self.share_weights = share_weights
        if self.share_weights:
            #input-hidden weights shared between bi-directional rnn.
            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            # batch norm is only performed on input-state projection
            self.fw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')
            self.bw_fc = self.fw_fc
            self.bw_bn = self.fw_bn
        else:
            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            self.fw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')
            self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            self.bw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')

        self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')
        self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')
        self.fw_rnn = nn.RNN(
            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
        self.bw_rnn = nn.RNN(
            self.fw_cell, is_reverse=True, time_major=False)  #[B, T, D]

    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
        # x, shape [B, T, D]
        fw_x = self.fw_bn(self.fw_fc(x))
        bw_x = self.bw_bn(self.bw_fc(x))
        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
        x = paddle.concat([fw_x, bw_x], axis=-1)
        return x, x_len


class BiGRUWithBN(nn.Layer):
    """Bidirectonal gru layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer.
    :type name: string
    :param input: Input layer.
    :type input: Variable
    :param size: Dimension of GRU cells.
    :type size: int
    :param act: Activation type.
    :type act: string
    :return: Bidirectional GRU layer.
    :rtype: Variable
    """

    def __init__(self, i_size: int, h_size: int):
        super().__init__()
        hidden_size = h_size * 3

        self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
        self.fw_bn = nn.BatchNorm1D(
            hidden_size, bias_attr=None, data_format='NLC')
        self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
        self.bw_bn = nn.BatchNorm1D(
            hidden_size, bias_attr=None, data_format='NLC')

        self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
        self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
        self.fw_rnn = nn.RNN(
            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
        self.bw_rnn = nn.RNN(
            self.fw_cell, is_reverse=True, time_major=False)  #[B, T, D]

    def forward(self, x, x_len):
        # x, shape [B, T, D]
        fw_x = self.fw_bn(self.fw_fc(x))
        bw_x = self.bw_bn(self.bw_fc(x))
        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
        x = paddle.concat([fw_x, bw_x], axis=-1)
        return x, x_len


class RNNStack(nn.Layer):
    """RNN group with stacked bidirectional simple RNN or GRU layers.

    :param input: Input layer.
    :type input: Variable
    :param size: Dimension of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward directional RNNs.
                              It is only available when use_gru=False.
    :type share_weights: bool
    :return: Output layer of the RNN group.
    :rtype: Variable
    """

    def __init__(self,
                 i_size: int,
                 h_size: int,
                 num_stacks: int,
                 use_gru: bool,
                 share_rnn_weights: bool):
        super().__init__()
        rnn_stacks = []
        for i in range(num_stacks):
            if use_gru:
                #default:GRU using tanh
                rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size))
            else:
                rnn_stacks.append(
                    BiRNNWithBN(
                        i_size=i_size,
                        h_size=h_size,
                        share_weights=share_rnn_weights))
            i_size = h_size * 2

        self.rnn_stacks = nn.LayerList(rnn_stacks)

    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
        """
        x: shape [B, T, D]
        x_len: shpae [B]
        """
        for i, rnn in enumerate(self.rnn_stacks):
            x, x_len = rnn(x, x_len)
            masks = make_non_pad_mask(x_len)  #[B, T]
            masks = masks.unsqueeze(-1)  # [B, T, 1]
            # TODO(Hui Zhang): not support bool multiply
            masks = masks.astype(x.dtype)
            x = x.multiply(masks)
        return x, x_len
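The GRUCell above applies the reset gate after the hidden-state matmul and then combines the candidate with the previous state as h = (h_prev - c) * z + c, which is algebraically the same as the z*h_prev + (1-z)*c form in the docstring. A small NumPy sketch of one step, with made-up shapes, just to make that update concrete:

    # NumPy sketch of a single GRU step mirroring GRUCell.forward; not the Paddle layer itself.
    import numpy as np

    def sigmoid(v):
        return 1.0 / (1.0 + np.exp(-v))

    def gru_step(x_gates, h_prev, W_hh, b_hh):
        # x_gates: input already projected to 3 * hidden_size (as BiGRUWithBN's fw_fc does)
        h_gates = h_prev @ W_hh.T + b_hh
        x_r, x_z, x_c = np.split(x_gates, 3)
        h_r, h_z, h_c = np.split(h_gates, 3)
        r = sigmoid(x_r + h_r)
        z = sigmoid(x_z + h_z)
        c = np.tanh(x_c + r * h_c)          # reset gate applied after the matmul
        return (h_prev - c) * z + c         # == z * h_prev + (1 - z) * c

    hidden = 4
    h = gru_step(np.zeros(3 * hidden), np.zeros(hidden),
                 np.eye(3 * hidden, hidden) * 0.1, np.zeros(3 * hidden))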
paddlespeech/s2t/models/ds2_online/__init__.py (deleted, 100644 → 0)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .deepspeech2 import DeepSpeech2InferModelOnline
from .deepspeech2 import DeepSpeech2ModelOnline
from paddlespeech.s2t.utils import dynamic_pip_install
import sys

try:
    import paddlespeech_ctcdecoders
except ImportError:
    try:
        package_name = 'paddlespeech_ctcdecoders'
        if sys.platform != "win32":
            dynamic_pip_install.install(package_name)
    except Exception:
        raise RuntimeError(
            "Can not install package paddlespeech_ctcdecoders on your system. \
                The DeepSpeech2 model is not supported for your system")

__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']
paddlespeech/s2t/models/ds2_online/conv.py (deleted, 100644 → 0)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle

from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4


class Conv2dSubsampling4Online(Conv2dSubsampling4):
    def __init__(self, idim: int, odim: int, dropout_rate: float):
        super().__init__(idim, odim, dropout_rate, None)
        self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim
        self.receptive_field_length = 2 * (
            3 - 1) + 3  # stride_1 * (kernel_size_2 - 1) + kerel_size_1

    def forward(self, x: paddle.Tensor,
                x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:
        x = x.unsqueeze(1)  # (b, c=1, t, f)
        x = self.conv(x)
        #b, c, t, f = paddle.shape(x)  #not work under jit
        x = x.transpose([0, 2, 1, 3]).reshape([0, 0, -1])
        x_len = ((x_len - 1) // 2 - 1) // 2
        return x, x_len
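The post-conv flatten above turns the (b, c, t, f) feature map into a [B, T, C*D] sequence. A shape-only NumPy sketch, with assumed example sizes B=2, C=32, T=10, F=19:

    # Shape check of the transpose + reshape in Conv2dSubsampling4Online.forward.
    import numpy as np

    x = np.zeros((2, 32, 10, 19))                   # (b, c, t, f) after the conv stack
    x = x.transpose(0, 2, 1, 3).reshape(2, 10, -1)  # -> [B, T, C*D]
    assert x.shape == (2, 10, 32 * 19)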
paddlespeech/s2t/models/ds2_online/deepspeech2.py (deleted, 100644 → 0)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Deepspeech2 ASR Online Model"""
import paddle
import paddle.nn.functional as F
from paddle import nn

from paddlespeech.s2t.models.ds2_online.conv import Conv2dSubsampling4Online
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.utils import layer_tools
from paddlespeech.s2t.utils.checkpoint import Checkpoint
from paddlespeech.s2t.utils.log import Log

logger = Log(__name__).getlog()

__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']


class CRNNEncoder(nn.Layer):
    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=4,
                 rnn_size=1024,
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
                 use_gru=False):
        super().__init__()
        self.rnn_size = rnn_size
        self.feat_size = feat_size  # 161 for linear
        self.dict_size = dict_size
        self.num_rnn_layers = num_rnn_layers
        self.num_fc_layers = num_fc_layers
        self.rnn_direction = rnn_direction
        self.fc_layers_size_list = fc_layers_size_list
        self.use_gru = use_gru
        self.conv = Conv2dSubsampling4Online(feat_size, 32, dropout_rate=0.0)

        self.output_dim = self.conv.output_dim

        i_size = self.conv.output_dim

        self.rnn = nn.LayerList()
        self.layernorm_list = nn.LayerList()
        self.fc_layers_list = nn.LayerList()
        if rnn_direction == 'bidirect' or rnn_direction == 'bidirectional':
            layernorm_size = 2 * rnn_size
        elif rnn_direction == 'forward':
            layernorm_size = rnn_size
        else:
            raise Exception("Wrong rnn direction")
        for i in range(0, num_rnn_layers):
            if i == 0:
                rnn_input_size = i_size
            else:
                rnn_input_size = layernorm_size
            if use_gru is True:
                self.rnn.append(
                    nn.GRU(
                        input_size=rnn_input_size,
                        hidden_size=rnn_size,
                        num_layers=1,
                        direction=rnn_direction))
            else:
                self.rnn.append(
                    nn.LSTM(
                        input_size=rnn_input_size,
                        hidden_size=rnn_size,
                        num_layers=1,
                        direction=rnn_direction))
            self.layernorm_list.append(nn.LayerNorm(layernorm_size))
            self.output_dim = layernorm_size

        fc_input_size = layernorm_size
        for i in range(self.num_fc_layers):
            self.fc_layers_list.append(
                nn.Linear(fc_input_size, fc_layers_size_list[i]))
            fc_input_size = fc_layers_size_list[i]
            self.output_dim = fc_layers_size_list[i]

    @property
    def output_size(self):
        return self.output_dim

    def forward(self, x, x_lens, init_state_h_box=None, init_state_c_box=None):
        """Compute Encoder outputs

        Args:
            x (Tensor): [B, T, D]
            x_lens (Tensor): [B]
            init_state_h_box(Tensor): init_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            init_state_c_box(Tensor): init_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
        Return:
            x (Tensor): encoder outputs, [B, T, D]
            x_lens (Tensor): encoder length, [B]
            final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
        """
        if init_state_h_box is not None:
            init_state_list = None

            if self.use_gru is True:
                init_state_h_list = paddle.split(
                    init_state_h_box, self.num_rnn_layers, axis=0)
                init_state_list = init_state_h_list
            else:
                init_state_h_list = paddle.split(
                    init_state_h_box, self.num_rnn_layers, axis=0)
                init_state_c_list = paddle.split(
                    init_state_c_box, self.num_rnn_layers, axis=0)
                init_state_list = [(init_state_h_list[i], init_state_c_list[i])
                                   for i in range(self.num_rnn_layers)]
        else:
            init_state_list = [None] * self.num_rnn_layers

        x, x_lens = self.conv(x, x_lens)
        final_chunk_state_list = []
        for i in range(0, self.num_rnn_layers):
            x, final_state = self.rnn[i](x, init_state_list[i],
                                         x_lens)  #[B, T, D]
            final_chunk_state_list.append(final_state)
            x = self.layernorm_list[i](x)

        for i in range(self.num_fc_layers):
            x = self.fc_layers_list[i](x)
            x = F.relu(x)

        if self.use_gru is True:
            final_chunk_state_h_box = paddle.concat(
                final_chunk_state_list, axis=0)
            final_chunk_state_c_box = init_state_c_box
        else:
            final_chunk_state_h_list = [
                final_chunk_state_list[i][0] for i in range(self.num_rnn_layers)
            ]
            final_chunk_state_c_list = [
                final_chunk_state_list[i][1] for i in range(self.num_rnn_layers)
            ]
            final_chunk_state_h_box = paddle.concat(
                final_chunk_state_h_list, axis=0)
            final_chunk_state_c_box = paddle.concat(
                final_chunk_state_c_list, axis=0)

        return x, x_lens, final_chunk_state_h_box, final_chunk_state_c_box

    def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8):
        """Compute Encoder outputs

        Args:
            x (Tensor): [B, T, D]
            x_lens (Tensor): [B]
            decoder_chunk_size: The chunk size of decoder
        Returns:
            eouts_list (List of Tensor): The list of encoder outputs in chunk_size: [B, chunk_size, D] * num_chunks
            eouts_lens_list (List of Tensor): The list of encoder length in chunk_size: [B] * num_chunks
            final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
        """
        subsampling_rate = self.conv.subsampling_rate
        receptive_field_length = self.conv.receptive_field_length
        chunk_size = (decoder_chunk_size - 1
                      ) * subsampling_rate + receptive_field_length
        chunk_stride = subsampling_rate * decoder_chunk_size
        max_len = x.shape[1]
        assert (chunk_size <= max_len)

        eouts_chunk_list = []
        eouts_chunk_lens_list = []
        if (max_len - chunk_size) % chunk_stride != 0:
            padding_len = chunk_stride - (max_len - chunk_size) % chunk_stride
        else:
            padding_len = 0
        padding = paddle.zeros((x.shape[0], padding_len, x.shape[2]))
        padded_x = paddle.concat([x, padding], axis=1)
        num_chunk = (max_len + padding_len - chunk_size) / chunk_stride + 1
        num_chunk = int(num_chunk)
        chunk_state_h_box = None
        chunk_state_c_box = None
        final_state_h_box = None
        final_state_c_box = None
        for i in range(0, num_chunk):
            start = i * chunk_stride
            end = start + chunk_size
            x_chunk = padded_x[:, start:end, :]

            x_len_left = paddle.where(x_lens - i * chunk_stride < 0,
                                      paddle.zeros_like(x_lens),
                                      x_lens - i * chunk_stride)
            x_chunk_len_tmp = paddle.ones_like(x_lens) * chunk_size
            x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp,
                                        x_len_left, x_chunk_len_tmp)

            eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward(
                x_chunk, x_chunk_lens, chunk_state_h_box, chunk_state_c_box)

            eouts_chunk_list.append(eouts_chunk)
            eouts_chunk_lens_list.append(eouts_chunk_lens)
        final_state_h_box = chunk_state_h_box
        final_state_c_box = chunk_state_c_box
        return eouts_chunk_list, eouts_chunk_lens_list, final_state_h_box, final_state_c_box


class DeepSpeech2ModelOnline(nn.Layer):
    """The DeepSpeech2 network structure for online.

    :param audio: Audio spectrogram data layer.
    :type audio: Variable
    :param text: Transcription text data layer.
    :type text: Variable
    :param audio_len: Valid sequence length data layer.
    :type audio_len: Variable
    :param feat_size: feature size for audio.
    :type feat_size: int
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (dimension of RNN cells).
    :type rnn_size: int
    :param num_fc_layers: Number of stacking FC layers.
    :type num_fc_layers: int
    :param fc_layers_size_list: The list of FC layer sizes.
    :type fc_layers_size_list: [int,]
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :return: A tuple of an output unnormalized log probability layer (
             before softmax) and a ctc cost layer.
    :rtype: tuple of LayerOutput
    """

    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=4,
                 rnn_size=1024,
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
                 use_gru=False,
                 blank_id=0,
                 ctc_grad_norm_type=None, ):
        super().__init__()
        self.encoder = CRNNEncoder(
            feat_size=feat_size,
            dict_size=dict_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_direction=rnn_direction,
            num_fc_layers=num_fc_layers,
            fc_layers_size_list=fc_layers_size_list,
            rnn_size=rnn_size,
            use_gru=use_gru)

        self.decoder = CTCDecoder(
            odim=dict_size,  # <blank> is in vocab
            enc_n_units=self.encoder.output_size,
            blank_id=blank_id,
            dropout_rate=0.0,
            reduction=True,  # sum
            batch_average=True,  # sum / batch_size
            grad_norm_type=ctc_grad_norm_type)

    def forward(self, audio, audio_len, text, text_len):
        """Compute Model loss

        Args:
            audio (Tensor): [B, T, D]
            audio_len (Tensor): [B]
            text (Tensor): [B, U]
            text_len (Tensor): [B]

        Returns:
            loss (Tensor): [1]
        """
        eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
            audio, audio_len, None, None)
        loss = self.decoder(eouts, eouts_len, text, text_len)
        return loss

    @paddle.no_grad()
    def decode(self, audio, audio_len):
        # decoders only accept string encoded in utf-8
        # Make sure the decoder has been initialized
        eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
            audio, audio_len, None, None)
        probs = self.decoder.softmax(eouts)
        batch_size = probs.shape[0]
        self.decoder.reset_decoder(batch_size=batch_size)
        self.decoder.next(probs, eouts_len)
        trans_best, trans_beam = self.decoder.decode()
        return trans_best

    @classmethod
    def from_pretrained(cls, dataloader, config, checkpoint_path):
        """Build a DeepSpeech2Model model from a pretrained model.

        Parameters
        ----------
        dataloader: paddle.io.DataLoader

        config: yacs.config.CfgNode
            model configs

        checkpoint_path: Path or str
            the path of pretrained model checkpoint, without extension name

        Returns
        -------
        DeepSpeech2ModelOnline
            The model built from pretrained result.
        """
        model = cls(
            feat_size=dataloader.collate_fn.feature_size,
            dict_size=dataloader.collate_fn.vocab_size,
            num_conv_layers=config.num_conv_layers,
            num_rnn_layers=config.num_rnn_layers,
            rnn_size=config.rnn_layer_size,
            rnn_direction=config.rnn_direction,
            num_fc_layers=config.num_fc_layers,
            fc_layers_size_list=config.fc_layers_size_list,
            use_gru=config.use_gru,
            blank_id=config.blank_id,
            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
        infos = Checkpoint().load_parameters(
            model, checkpoint_path=checkpoint_path)
        logger.info(f"checkpoint info: {infos}")
        layer_tools.summary(model)
        return model

    @classmethod
    def from_config(cls, config):
        """Build a DeepSpeec2ModelOnline from config

        Parameters

        config: yacs.config.CfgNode
            config
        Returns
        -------
        DeepSpeech2ModelOnline
            The model built from config.
        """
        model = cls(
            feat_size=config.input_dim,
            dict_size=config.output_dim,
            num_conv_layers=config.num_conv_layers,
            num_rnn_layers=config.num_rnn_layers,
            rnn_size=config.rnn_layer_size,
            rnn_direction=config.rnn_direction,
            num_fc_layers=config.num_fc_layers,
            fc_layers_size_list=config.fc_layers_size_list,
            use_gru=config.use_gru,
            blank_id=config.blank_id,
            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
        return model


class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
                chunk_state_c_box):
        eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder(
            audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box)
        probs_chunk = self.decoder.softmax(eouts_chunk)
        return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box

    def export(self):
        static_model = paddle.jit.to_static(
            self,
            input_spec=[
                paddle.static.InputSpec(
                    shape=[None, None, self.encoder.feat_size],  #[B, chunk_size, feat_dim]
                    dtype='float32'),
                paddle.static.InputSpec(shape=[None], dtype='int64'),  # audio_length, [B]
                paddle.static.InputSpec(
                    shape=[None, None, None], dtype='float32'),
                paddle.static.InputSpec(
                    shape=[None, None, None], dtype='float32')
            ])
        return static_model
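The h/c "box" tensors above stack per-layer RNN states along axis 0 and are split back into one slice per layer at the start of forward. A NumPy sketch with assumed sizes (4 layers, batch 2, hidden 8), mirroring the paddle.split calls:

    # Shape sketch of splitting the stacked state box back into per-layer states.
    import numpy as np

    num_rnn_layers, batch_size, hidden_size = 4, 2, 8       # example values
    state_h_box = np.zeros((num_rnn_layers, batch_size, hidden_size))
    init_state_h_list = np.split(state_h_box, num_rnn_layers, axis=0)
    assert len(init_state_h_list) == num_rnn_layers
    assert init_state_h_list[0].shape == (1, batch_size, hidden_size)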
paddlespeech/server/engine/asr/online/asr_engine.py
浏览文件 @
919c8d06
...
...
@@ -25,7 +25,6 @@ from paddlespeech.cli.log import logger
from
paddlespeech.cli.utils
import
MODEL_HOME
from
paddlespeech.resource
import
CommonTaskResource
from
paddlespeech.s2t.frontend.featurizer.text_featurizer
import
TextFeaturizer
from
paddlespeech.s2t.frontend.speech
import
SpeechSegment
from
paddlespeech.s2t.modules.ctc
import
CTCDecoder
from
paddlespeech.s2t.transform.transformation
import
Transformation
from
paddlespeech.s2t.utils.tensor_utils
import
add_sos_eos
...
...
@@ -66,10 +65,13 @@ class PaddleASRConnectionHanddler:
self
.
text_feature
=
self
.
asr_engine
.
executor
.
text_feature
if
"deepspeech2"
in
self
.
model_type
:
from
paddlespeech.s2t.io.collator
import
SpeechCollator
self
.
am_predictor
=
self
.
asr_engine
.
executor
.
am_predictor
self
.
collate_fn_test
=
SpeechCollator
.
from_config
(
self
.
model_config
)
# extract feat, new only fbank in conformer model
self
.
preprocess_conf
=
self
.
model_config
.
preprocess_config
self
.
preprocess_args
=
{
"train"
:
False
}
self
.
preprocessing
=
Transformation
(
self
.
preprocess_conf
)
self
.
decoder
=
CTCDecoder
(
odim
=
self
.
model_config
.
output_dim
,
# <blank> is in vocab
enc_n_units
=
self
.
model_config
.
rnn_layer_size
*
2
,
...
...
@@ -89,10 +91,8 @@ class PaddleASRConnectionHanddler:
cfg
.
num_proc_bsearch
)
# frame window and frame shift, in samples unit
self
.
win_length
=
int
(
self
.
model_config
.
window_ms
/
1000
*
self
.
sample_rate
)
self
.
n_shift
=
int
(
self
.
model_config
.
stride_ms
/
1000
*
self
.
sample_rate
)
self
.
win_length
=
self
.
preprocess_conf
.
process
[
0
][
'win_length'
]
self
.
n_shift
=
self
.
preprocess_conf
.
process
[
0
][
'n_shift'
]
elif
"conformer"
in
self
.
model_type
or
"transformer"
in
self
.
model_type
:
# acoustic model
...
...
@@ -123,11 +123,6 @@ class PaddleASRConnectionHanddler:
samples
=
np
.
frombuffer
(
samples
,
dtype
=
np
.
int16
)
assert
samples
.
ndim
==
1
# pcm16 -> pcm 32
# pcm2float will change the orignal samples,
# so we shoule do pcm2float before concatenate
samples
=
pcm2float
(
samples
)
if
self
.
remained_wav
is
None
:
self
.
remained_wav
=
samples
else
:
...
...
@@ -137,26 +132,11 @@ class PaddleASRConnectionHanddler:
f
"The connection remain the audio samples:
{
self
.
remained_wav
.
shape
}
"
)
# read audio
speech_segment
=
SpeechSegment
.
from_pcm
(
self
.
remained_wav
,
self
.
sample_rate
,
transcript
=
" "
)
# audio augment
self
.
collate_fn_test
.
augmentation
.
transform_audio
(
speech_segment
)
# extract speech feature
spectrum
,
transcript_part
=
self
.
collate_fn_test
.
_speech_featurizer
.
featurize
(
speech_segment
,
self
.
collate_fn_test
.
keep_transcription_text
)
# CMVN spectrum
if
self
.
collate_fn_test
.
_normalizer
:
spectrum
=
self
.
collate_fn_test
.
_normalizer
.
apply
(
spectrum
)
# spectrum augment
feat
=
self
.
collate_fn_test
.
augmentation
.
transform_feature
(
spectrum
)
# audio_len is frame num
frame_num
=
feat
.
shape
[
0
]
feat
=
paddle
.
to_tensor
(
feat
,
dtype
=
'float32'
)
feat
=
paddle
.
unsqueeze
(
feat
,
axis
=
0
)
# fbank
feat
=
self
.
preprocessing
(
self
.
remained_wav
,
**
self
.
preprocess_args
)
feat
=
paddle
.
to_tensor
(
feat
,
dtype
=
"float32"
).
unsqueeze
(
axis
=
0
)
if
self
.
cached_feat
is
None
:
self
.
cached_feat
=
feat
...
...
@@ -170,8 +150,11 @@ class PaddleASRConnectionHanddler:
if
self
.
device
is
None
:
self
.
device
=
self
.
cached_feat
.
place
self
.
num_frames
+=
frame_num
self
.
remained_wav
=
self
.
remained_wav
[
self
.
n_shift
*
frame_num
:]
# cur frame step
num_frames
=
feat
.
shape
[
1
]
self
.
num_frames
+=
num_frames
self
.
remained_wav
=
self
.
remained_wav
[
self
.
n_shift
*
num_frames
:]
logger
.
info
(
f
"process the audio feature success, the connection feat shape:
{
self
.
cached_feat
.
shape
}
"
...
...
@@ -752,16 +735,19 @@ class ASRServerExecutor(ASRExecutor):
            self.config = CfgNode(new_allowed=True)
            self.config.merge_from_file(self.cfg_path)

        if self.config.spm_model_prefix:
            self.config.spm_model_prefix = os.path.join(
                self.res_path, self.config.spm_model_prefix)
        self.text_feature = TextFeaturizer(
            unit_type=self.config.unit_type,
            vocab=self.config.vocab_filepath,
            spm_model_prefix=self.config.spm_model_prefix)
        self.vocab = self.config.vocab_filepath

        with UpdateConfig(self.config):
            if "deepspeech2" in model_type:
                from paddlespeech.s2t.io.collator import SpeechCollator
                self.vocab = self.config.vocab_filepath
                self.config.decode.lang_model_path = os.path.join(
                    MODEL_HOME, 'language_model',
                    self.config.decode.lang_model_path)
                self.collate_fn_test = SpeechCollator.from_config(self.config)
                self.text_feature = TextFeaturizer(
                    unit_type=self.config.unit_type, vocab=self.vocab)

                lm_url = self.task_resource.res_dict['lm_url']
                lm_md5 = self.task_resource.res_dict['lm_md5']
...
...
@@ -772,14 +758,6 @@ class ASRServerExecutor(ASRExecutor):
            elif "conformer" in model_type or "transformer" in model_type:
                logger.info("start to create the stream conformer asr engine")
                if self.config.spm_model_prefix:
                    self.config.spm_model_prefix = os.path.join(
                        self.res_path, self.config.spm_model_prefix)
                self.vocab = self.config.vocab_filepath
                self.text_feature = TextFeaturizer(
                    unit_type=self.config.unit_type,
                    vocab=self.config.vocab_filepath,
                    spm_model_prefix=self.config.spm_model_prefix)
                # update the decoding method
                if decode_method:
                    self.config.decode.decoding_method = decode_method
...
...
paddlespeech/server/engine/asr/paddleinference/asr_engine.py
...
...
@@ -54,6 +54,7 @@ class ASRServerExecutor(ASRExecutor):
        self.max_len = 50
        sample_rate_str = '16k' if sample_rate == 16000 else '8k'
        tag = model_type + '-' + lang + '-' + sample_rate_str
        self.max_len = 50
        self.task_resource.set_task_model(model_tag=tag)
        if cfg_path is None or am_model is None or am_params is None:
            self.res_path = self.task_resource.res_dir
...
...
@@ -80,22 +81,25 @@ class ASRServerExecutor(ASRExecutor):
        self.config.merge_from_file(self.cfg_path)

        with UpdateConfig(self.config):
            if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
                from paddlespeech.s2t.io.collator import SpeechCollator
            if "deepspeech2" in model_type:
                self.vocab = self.config.vocab_filepath
                if self.config.spm_model_prefix:
                    self.config.spm_model_prefix = os.path.join(
                        self.res_path, self.config.spm_model_prefix)
                self.text_feature = TextFeaturizer(
                    unit_type=self.config.unit_type,
                    vocab=self.vocab,
                    spm_model_prefix=self.config.spm_model_prefix)
                self.config.decode.lang_model_path = os.path.join(
                    MODEL_HOME, 'language_model',
                    self.config.decode.lang_model_path)
                self.collate_fn_test = SpeechCollator.from_config(self.config)
                self.text_feature = TextFeaturizer(
                    unit_type=self.config.unit_type, vocab=self.vocab)

                lm_url = self.task_resource.res_dict['lm_url']
                lm_md5 = self.task_resource.res_dict['lm_md5']
                self.download_lm(
                    lm_url,
                    os.path.dirname(self.config.decode.lang_model_path), lm_md5)
            elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type:
            elif "conformer" in model_type or "transformer" in model_type:
                raise Exception("wrong type")
            else:
                raise Exception("wrong type")
...
...
@@ -125,7 +129,7 @@ class ASRServerExecutor(ASRExecutor):
        cfg = self.config.decode
        audio = self._inputs["audio"]
        audio_len = self._inputs["audio_len"]
        if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
        if "deepspeech2" in model_type:
            decode_batch_size = audio.shape[0]
            # init once
            self.decoder.init_decoder(
...
...
@@ -222,7 +226,6 @@ class PaddleASRConnectionHandler(ASRServerExecutor):
        self.decoder = self.executor.decoder
        self.am_predictor = self.executor.am_predictor
        self.text_feature = self.executor.text_feature
        self.collate_fn_test = self.executor.collate_fn_test

    def run(self, audio_data):
        """engine run
...
...
paddlespeech/server/engine/tts/online/python/tts_engine.py
...
...
@@ -40,7 +40,7 @@ class TTSServerExecutor(TTSExecutor):
    def __init__(self):
        super().__init__()
        self.task_resource = CommonTaskResource(
            task='tts', model_format='static', inference_mode='online')
            task='tts', model_format='dynamic', inference_mode='online')

    def get_model_info(self,
...
...
speechx/CMakeLists.txt
...
...
@@ -142,4 +142,3 @@ set(DEPS ${DEPS}
set(SPEECHX_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/speechx)
add_subdirectory(speechx)
add_subdirectory(examples)
speechx/examples/custom_asr/run.sh
...
...
@@ -71,7 +71,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    recognizer_test_main \
        --wav_rspecifier=scp:$wav_scp \
        --cmvn_file=$cmvn \
        --streaming_chunk=30 \
        --use_fbank=true \
        --model_path=$model_dir/avg_10.jit.pdmodel \
        --param_path=$model_dir/avg_10.jit.pdiparams \
...
...
speechx/examples/ds2_ol/README.md
...
...
@@ -2,13 +2,5 @@
## Examples

* `websocket` - Streaming ASR with websocket.
* `websocket` - Streaming ASR with websocket for deepspeech2_aishell.
* `aishell` - Streaming Decoding under aishell dataset, for local WER test.
\ No newline at end of file

## More
> The below is for developing and offline testing. Do not run it unless you know what it is.
* nnet
* feat
* decoder
speechx/examples/ds2_ol/aishell/path.sh
...
...
@@ -20,5 +20,5 @@ export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm

SPEECHX_BIN=$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/websocket
SPEECHX_BIN=$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN:${SRILM}/bin:${SRILM}/bin/i686-m64:$KALDI_DIR/lmbin:$KALDI_DIR/fstbin:$OPENFST_DIR/bin
speechx/examples/ds2_ol/aishell/run.sh
...
...
@@ -78,7 +78,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
        --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
        --cmvn_file=$cmvn \
        --streaming_chunk=0.36

    echo "feature make have finished!!!"
fi
...
...
@@ -155,7 +154,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
        --cmvn_file=$cmvn \
        --model_path=$model_dir/avg_1.jit.pdmodel \
        --streaming_chunk=30 \
        --param_path=$model_dir/avg_1.jit.pdiparams \
        --word_symbol_table=$wfst/words.txt \
        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
...
...
speechx/examples/ds2_ol/aishell/run_fbank.sh
...
...
@@ -152,7 +152,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
        --cmvn_file=$cmvn \
        --model_path=$model_dir/avg_5.jit.pdmodel \
        --streaming_chunk=30 \
        --use_fbank=true \
        --param_path=$model_dir/avg_5.jit.pdiparams \
        --word_symbol_table=$wfst/words.txt \
...
...
speechx/examples/ds2_ol/websocket/path.sh
...
...
@@ -10,5 +10,5 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
export LC_AL=C

SPEECHX_BIN=$SPEECHX_BUILD/protocol/websocket
SPEECHX_BIN=$SPEECHX_BUILD/protocol/websocket:$SPEECHX_BUILD/frontend/audio
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
speechx/examples/ds2_ol/websocket/websocket_client.sh
...
...
@@ -32,4 +32,4 @@ export GLOG_logtostderr=1
# websocket client
websocket_client_main \
    --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.36
\ No newline at end of file
    --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.5
speechx/examples/ds2_ol/websocket/websocket_server.sh
...
...
@@ -4,7 +4,6 @@ set -e
. path.sh

# 1. compile
if [ ! -d ${SPEECHX_EXAMPLES} ]; then
    pushd ${SPEECHX_ROOT}
...
...
@@ -19,19 +18,6 @@ ckpt_dir=$data/model
model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
vocb_dir=$ckpt_dir/data/lang_char/

# output
aishell_wav_scp=aishell_test.scp
if [ ! -d $data/test ]; then
    pushd $data
    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
    unzip aishell_test.zip
    popd

    realpath $data/test/*/*.wav > $data/wavlist
    awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
    paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
fi

if [ ! -f $ckpt_dir/data/mean_std.json ]; then
    mkdir -p $ckpt_dir
...
...
@@ -62,7 +48,6 @@ fi
websocket_server_main \
    --cmvn_file=$cmvn \
    --model_path=$model_dir/avg_1.jit.pdmodel \
    --streaming_chunk=0.1 \
    --param_path=$model_dir/avg_1.jit.pdiparams \
    --word_symbol_table=$wfst/words.txt \
    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
...
...
speechx/speechx/decoder/param.h
...
...
@@ -25,7 +25,6 @@ DEFINE_bool(use_fbank, false, "False for fbank; or linear feature");
// feature, or fbank");
DEFINE_int32(num_bins, 161, "num bins of mel");
DEFINE_string(cmvn_file, "", "read cmvn");
DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size");
// feature sliding window
DEFINE_int32(receptive_field_length,
             7,
...
...
@@ -62,7 +61,6 @@ namespace ppspeech {
FeaturePipelineOptions InitFeaturePipelineOptions() {
    FeaturePipelineOptions opts;
    opts.cmvn_file = FLAGS_cmvn_file;
    opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
    kaldi::FrameExtractionOptions frame_opts;
    frame_opts.dither = 0.0;
    frame_opts.frame_shift_ms = 10;
...
...
@@ -71,8 +69,8 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
        opts.to_float32 = false;
        frame_opts.window_type = "povey";
        frame_opts.frame_length_ms = 25;
        opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
        opts.fbank_opts.fbank_opts.frame_opts = frame_opts;
        opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
        opts.fbank_opts.frame_opts = frame_opts;
    } else {
        opts.to_float32 = true;
        frame_opts.remove_dc_offset = false;
...
...
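Taken together, the two hunks above drop the streaming_chunk flag from the feature pipeline and flatten fbank_opts into a plain kaldi::FbankOptions. As a rough sketch of what the fbank branch of InitFeaturePipelineOptions looks like after the change (the unchanged surrounding lines and the exact branch condition are assumed here, not copied from the repository):

```cpp
// Sketch only: assembled from the hunks above; surrounding lines are assumed.
ppspeech::FeaturePipelineOptions InitFeaturePipelineOptions() {
    ppspeech::FeaturePipelineOptions opts;
    opts.cmvn_file = FLAGS_cmvn_file;

    kaldi::FrameExtractionOptions frame_opts;
    frame_opts.dither = 0.0;
    frame_opts.frame_shift_ms = 10;

    if (FLAGS_use_fbank) {          // assumed branch condition
        opts.use_fbank = true;      // assumed; the struct exposes use_fbank
        opts.to_float32 = false;
        frame_opts.window_type = "povey";
        frame_opts.frame_length_ms = 25;
        // fbank_opts is now a plain kaldi::FbankOptions, so the options sit
        // one level up compared with the removed ppspeech::FbankOptions wrapper.
        opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
        opts.fbank_opts.frame_opts = frame_opts;
    } else {
        opts.to_float32 = true;
        frame_opts.remove_dc_offset = false;
        // ... linear-spectrogram branch unchanged apart from streaming_chunk ...
    }
    return opts;
}
```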
speechx/speechx/decoder/recognizer_main.cc
...
...
@@ -19,6 +19,7 @@
DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
DEFINE_string(result_wspecifier, "", "test result wspecifier");
DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
DEFINE_int32(sample_rate, 16000, "sample rate");

int main(int argc, char* argv[]) {
...
...
speechx/speechx/frontend/audio/audio_cache.h
...
...
@@ -30,8 +30,9 @@ class AudioCache : public FrontendInterface {
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);

    // the audio dim is 1, one sample
    virtual size_t Dim() const { return 1; }
    // the audio dim is 1, one sample, which is useless,
    // so we return size_(cache samples) instead.
    virtual size_t Dim() const { return size_; }

    virtual void SetFinished() {
        std::lock_guard<std::mutex> lock(mutex_);
...
...
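The Dim() change above pairs with the new StreamingFeatureTpl added later in this commit: the templated frontend sizes its per-Read buffer from the upstream frontend's Dim() instead of from a fixed streaming_chunk, so AudioCache now has to report how many samples it currently holds rather than the per-sample dimension of 1. A minimal illustration of that call pattern (assuming base_extractor points at an AudioCache):

```cpp
// Illustration only: how the templated consumer in feature_common_inl.h
// uses the upstream Dim(). With AudioCache::Dim() == size_, the buffer
// below covers everything currently cached instead of a single sample.
kaldi::Vector<kaldi::BaseFloat> wav(base_extractor->Dim());
bool ok = base_extractor->Read(&wav);  // fills up to wav.Dim() samples
```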
speechx/speechx/frontend/audio/compute_fbank_main.cc
...
...
@@ -49,12 +49,11 @@ int main(int argc, char* argv[]) {
    std::unique_ptr<ppspeech::FrontendInterface> data_source(
        new ppspeech::AudioCache(3600 * 1600, false));

    ppspeech::FbankOptions opt;
    opt.fbank_opts.frame_opts.frame_length_ms = 25;
    opt.fbank_opts.frame_opts.frame_shift_ms = 10;
    opt.streaming_chunk = FLAGS_streaming_chunk;
    opt.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
    opt.fbank_opts.frame_opts.dither = 0.0;
    kaldi::FbankOptions opt;
    opt.frame_opts.frame_length_ms = 25;
    opt.frame_opts.frame_shift_ms = 10;
    opt.mel_opts.num_bins = FLAGS_num_bins;
    opt.frame_opts.dither = 0.0;

    std::unique_ptr<ppspeech::FrontendInterface> fbank(
        new ppspeech::Fbank(opt, std::move(data_source)));
...
...
speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
...
...
@@ -49,7 +49,6 @@ int main(int argc, char* argv[]) {
    ppspeech::LinearSpectrogramOptions opt;
    opt.frame_opts.frame_length_ms = 20;
    opt.frame_opts.frame_shift_ms = 10;
    opt.streaming_chunk = FLAGS_streaming_chunk;
    opt.frame_opts.dither = 0.0;
    opt.frame_opts.remove_dc_offset = false;
    opt.frame_opts.window_type = "hanning";
...
...
speechx/speechx/frontend/audio/fbank.cc
...
...
@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "frontend/audio/fbank.h"

#include "kaldi/base/kaldi-math.h"
#include "kaldi/feat/feature-common.h"
...
...
@@ -29,95 +28,33 @@ using kaldi::VectorBase;
using kaldi::Matrix;
using std::vector;

// todo refactor later:(SmileGoat)
Fbank::Fbank(const FbankOptions& opts,
             std::unique_ptr<FrontendInterface> base_extractor)
FbankComputer::FbankComputer(const Options& opts)
    : opts_(opts),
      computer_(opts.fbank_opts),
      window_function_(opts.fbank_opts.frame_opts) {
    base_extractor_ = std::move(base_extractor);
    chunk_sample_size_ = static_cast<int32>(
        opts.streaming_chunk * opts.fbank_opts.frame_opts.samp_freq);
}
      computer_(opts) {}

void Fbank::Accept(const VectorBase<BaseFloat>& inputs) {
    base_extractor_->Accept(inputs);
int32 FbankComputer::Dim() const {
    return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
}

bool Fbank::Read(Vector<BaseFloat>* feats) {
    Vector<BaseFloat> wav(chunk_sample_size_);
    bool flag = base_extractor_->Read(&wav);
    if (flag == false || wav.Dim() == 0) return false;

    // append remained waves
    int32 wav_len = wav.Dim();
    int32 left_len = remained_wav_.Dim();
    Vector<BaseFloat> waves(left_len + wav_len);
    waves.Range(0, left_len).CopyFromVec(remained_wav_);
    waves.Range(left_len, wav_len).CopyFromVec(wav);

    // compute speech feature
    Compute(waves, feats);

    // cache remained waves
    kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
    int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
    int32 frame_shift = frame_opts.WindowShift();
    int32 left_samples = waves.Dim() - frame_shift * num_frames;
    remained_wav_.Resize(left_samples);
    remained_wav_.CopyFromVec(waves.Range(frame_shift * num_frames, left_samples));
    return true;
bool FbankComputer::NeedRawLogEnergy() {
    return opts_.use_energy && opts_.raw_energy;
}

// Compute spectrogram feat
bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
    const kaldi::FrameExtractionOptions& frame_opts = computer_.GetFrameOptions();
    int32 num_samples = waves.Dim();
    int32 frame_length = frame_opts.WindowSize();
    int32 sample_rate = frame_opts.samp_freq;
    if (num_samples < frame_length) {
        return true;
    }
    int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
    feats->Resize(num_frames * Dim());
    Vector<BaseFloat> window;
    bool need_raw_log_energy = computer_.NeedRawLogEnergy();
    for (int32 frame = 0; frame < num_frames; frame++) {
        BaseFloat raw_log_energy = 0.0;
        kaldi::ExtractWindow(0,
                             waves,
                             frame,
                             frame_opts,
                             window_function_,
                             &window,
                             need_raw_log_energy ? &raw_log_energy : NULL);
        Vector<BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
        // note: this online feature-extraction code does not support VTLN.
        RealFft(&window, true);
        kaldi::ComputePowerSpectrum(&window);
// Compute feat
bool FbankComputer::Compute(Vector<BaseFloat>* window, Vector<BaseFloat>* feat) {
    RealFft(window, true);
    kaldi::ComputePowerSpectrum(window);
    const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));
    SubVector<BaseFloat> power_spectrum(window, 0, window.Dim() / 2 + 1);
    if (!opts_.fbank_opts.use_power) {
    SubVector<BaseFloat> power_spectrum(*window, 0, window->Dim() / 2 + 1);
    if (!opts_.use_power) {
        power_spectrum.ApplyPow(0.5);
    }
    int32 mel_offset = ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1 : 0);
    int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
    SubVector<BaseFloat> mel_energies(this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins);
    SubVector<BaseFloat> mel_energies(*feat, mel_offset, opts_.mel_opts.num_bins);
    mel_bank.Compute(power_spectrum, &mel_energies);
    mel_energies.ApplyFloor(1e-07);
    mel_energies.ApplyLog();
    SubVector<BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
    output_row.CopyFromVec(this_feature);
    }
    return true;
}
...
...
speechx/speechx/frontend/audio/fbank.h
...
...
@@ -15,6 +15,7 @@
#pragma once

#include "base/common.h"
#include "frontend/audio/feature_common.h"
#include "frontend/audio/frontend_itf.h"
#include "kaldi/feat/feature-fbank.h"
#include "kaldi/feat/feature-mfcc.h"
...
...
@@ -22,56 +23,28 @@
namespace ppspeech {

struct FbankOptions {
    kaldi::FbankOptions fbank_opts;
    kaldi::BaseFloat streaming_chunk;  // second
    FbankOptions() : streaming_chunk(0.1), fbank_opts() {}

    void Register(kaldi::OptionsItf* opts) {
        opts->Register("streaming-chunk",
                       &streaming_chunk,
                       "streaming chunk size, default: 0.1 sec");
        fbank_opts.Register(opts);
    }
};

class Fbank : public FrontendInterface {
class FbankComputer {
  public:
    explicit Fbank(const FbankOptions& opts,
                   std::unique_ptr<FrontendInterface> base_extractor);
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
    typedef kaldi::FbankOptions Options;
    explicit FbankComputer(const Options& opts);

    // the dim_ is the dim of single frame feature
    virtual size_t Dim() const { return computer_.Dim(); }

    virtual void SetFinished() { base_extractor_->SetFinished(); }
    kaldi::FrameExtractionOptions& GetFrameOptions() {
        return opts_.frame_opts;
    }

    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
    bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
                 kaldi::Vector<kaldi::BaseFloat>* feat);
    int32 Dim() const;

    virtual void Reset() {
        base_extractor_->Reset();
        remained_wav_.Resize(0);
    }
    bool NeedRawLogEnergy();

  private:
    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
                 kaldi::Vector<kaldi::BaseFloat>* feats);

    Options opts_;
    FbankOptions opts_;
    std::unique_ptr<FrontendInterface> base_extractor_;
    kaldi::FeatureWindowFunction window_function_;
    kaldi::FbankComputer computer_;
    // features_ is the Mfcc or Plp or Fbank features that we have already
    // computed.
    kaldi::Vector<kaldi::BaseFloat> features_;
    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
    kaldi::int32 chunk_sample_size_;

    DISALLOW_COPY_AND_ASSIGN(Fbank);
    DISALLOW_COPY_AND_ASSIGN(FbankComputer);
};

typedef StreamingFeatureTpl<FbankComputer> Fbank;

}  // namespace ppspeech
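With this refactor, Fbank is the generic streaming wrapper specialized with the frame-level FbankComputer. A minimal usage sketch, modeled on compute_fbank_main.cc above; the cache capacity, bin count, and helper function name are illustrative, not taken from the repository:

```cpp
#include "frontend/audio/audio_cache.h"
#include "frontend/audio/fbank.h"

// Sketch: chain AudioCache -> Fbank and drain features chunk by chunk.
void ExtractFbankSketch(const kaldi::VectorBase<kaldi::BaseFloat>& pcm) {
    std::unique_ptr<ppspeech::FrontendInterface> source(
        new ppspeech::AudioCache(3600 * 1600, false));  // illustrative capacity

    kaldi::FbankOptions opt;  // plain kaldi options after this change
    opt.frame_opts.frame_length_ms = 25;
    opt.frame_opts.frame_shift_ms = 10;
    opt.frame_opts.dither = 0.0;
    opt.mel_opts.num_bins = 80;  // illustrative value

    ppspeech::Fbank fbank(opt, std::move(source));  // StreamingFeatureTpl<FbankComputer>
    fbank.Accept(pcm);      // forwarded to the underlying AudioCache
    fbank.SetFinished();

    kaldi::Vector<kaldi::BaseFloat> feats;
    while (fbank.Read(&feats)) {
        // feats holds num_frames * fbank.Dim() values for this chunk
    }
}
```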
speechx/speechx/frontend/audio/feature_common.h
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "frontend_itf.h"
#include "kaldi/feat/feature-window.h"

namespace ppspeech {

template <class F>
class StreamingFeatureTpl : public FrontendInterface {
  public:
    typedef typename F::Options Options;
    StreamingFeatureTpl(const Options& opts,
                        std::unique_ptr<FrontendInterface> base_extractor);
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);

    // the dim_ is the dim of single frame feature
    virtual size_t Dim() const { return computer_.Dim(); }

    virtual void SetFinished() { base_extractor_->SetFinished(); }

    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }

    virtual void Reset() {
        base_extractor_->Reset();
        remained_wav_.Resize(0);
    }

  private:
    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
                 kaldi::Vector<kaldi::BaseFloat>* feats);
    Options opts_;
    std::unique_ptr<FrontendInterface> base_extractor_;
    kaldi::FeatureWindowFunction window_function_;
    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
    F computer_;
};

}  // namespace ppspeech

#include "frontend/audio/feature_common_inl.h"
speechx/speechx/frontend/audio/feature_common_inl.h
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

namespace ppspeech {

template <class F>
StreamingFeatureTpl<F>::StreamingFeatureTpl(
    const Options& opts, std::unique_ptr<FrontendInterface> base_extractor)
    : opts_(opts), computer_(opts), window_function_(opts.frame_opts) {
    base_extractor_ = std::move(base_extractor);
}

template <class F>
void StreamingFeatureTpl<F>::Accept(
    const kaldi::VectorBase<kaldi::BaseFloat>& waves) {
    base_extractor_->Accept(waves);
}

template <class F>
bool StreamingFeatureTpl<F>::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
    kaldi::Vector<kaldi::BaseFloat> wav(base_extractor_->Dim());
    bool flag = base_extractor_->Read(&wav);
    if (flag == false || wav.Dim() == 0) return false;

    // append remained waves
    int32 wav_len = wav.Dim();
    int32 left_len = remained_wav_.Dim();
    kaldi::Vector<kaldi::BaseFloat> waves(left_len + wav_len);
    waves.Range(0, left_len).CopyFromVec(remained_wav_);
    waves.Range(left_len, wav_len).CopyFromVec(wav);

    // compute speech feature
    Compute(waves, feats);

    // cache remained waves
    kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
    int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
    int32 frame_shift = frame_opts.WindowShift();
    int32 left_samples = waves.Dim() - frame_shift * num_frames;
    remained_wav_.Resize(left_samples);
    remained_wav_.CopyFromVec(
        waves.Range(frame_shift * num_frames, left_samples));
    return true;
}

// Compute feat
template <class F>
bool StreamingFeatureTpl<F>::Compute(
    const kaldi::Vector<kaldi::BaseFloat>& waves,
    kaldi::Vector<kaldi::BaseFloat>* feats) {
    const kaldi::FrameExtractionOptions& frame_opts =
        computer_.GetFrameOptions();
    int32 num_samples = waves.Dim();
    int32 frame_length = frame_opts.WindowSize();
    int32 sample_rate = frame_opts.samp_freq;
    if (num_samples < frame_length) {
        return true;
    }

    int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
    feats->Resize(num_frames * Dim());

    kaldi::Vector<kaldi::BaseFloat> window;
    bool need_raw_log_energy = computer_.NeedRawLogEnergy();
    for (int32 frame = 0; frame < num_frames; frame++) {
        kaldi::BaseFloat raw_log_energy = 0.0;
        kaldi::ExtractWindow(0,
                             waves,
                             frame,
                             frame_opts,
                             window_function_,
                             &window,
                             need_raw_log_energy ? &raw_log_energy : NULL);

        kaldi::Vector<kaldi::BaseFloat> this_feature(computer_.Dim(),
                                                     kaldi::kUndefined);
        computer_.Compute(&window, &this_feature);
        kaldi::SubVector<kaldi::BaseFloat> output_row(
            feats->Data() + frame * Dim(), Dim());
        output_row.CopyFromVec(this_feature);
    }
    return true;
}
}  // namespace ppspeech
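For orientation, the contract a computer class F must satisfy can be read off the template above: expose an Options typedef whose frame_opts drives the window function, plus GetFrameOptions(), Dim(), NeedRawLogEnergy(), and a per-frame Compute(window, feat). The skeleton below is a hypothetical example for illustration, not code from this commit:

```cpp
// Hypothetical computer satisfying StreamingFeatureTpl's requirements.
struct ToyOptions {
    kaldi::FrameExtractionOptions frame_opts;  // needed by window_function_(opts.frame_opts)
};

class ToyComputer {
  public:
    typedef ToyOptions Options;
    explicit ToyComputer(const Options& opts) : opts_(opts) {}

    kaldi::FrameExtractionOptions& GetFrameOptions() { return opts_.frame_opts; }
    int32 Dim() const { return opts_.frame_opts.WindowSize() / 2 + 1; }
    bool NeedRawLogEnergy() { return false; }

    // Receives one windowed frame (may modify it in place) and must write
    // exactly Dim() values into *feat, which the template pre-sizes.
    bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
                 kaldi::Vector<kaldi::BaseFloat>* feat) {
        feat->CopyFromVec(window->Range(0, Dim()));
        return true;
    }

  private:
    Options opts_;
};

typedef ppspeech::StreamingFeatureTpl<ToyComputer> ToyFrontend;
```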
speechx/speechx/frontend/audio/feature_pipeline.h
...
...
@@ -32,7 +32,7 @@ struct FeaturePipelineOptions {
    bool to_float32;  // true, only for linear feature
    bool use_fbank;
    LinearSpectrogramOptions linear_spectrogram_opts;
    FbankOptions fbank_opts;
    kaldi::FbankOptions fbank_opts;
    FeatureCacheOptions feature_cache_opts;
    AssemblerOptions assembler_opts;
...
...
speechx/speechx/frontend/audio/linear_spectrogram.cc
...
...
@@ -28,81 +28,32 @@ using kaldi::VectorBase;
using kaldi::Matrix;
using std::vector;

LinearSpectrogram::LinearSpectrogram(
    const LinearSpectrogramOptions& opts,
    std::unique_ptr<FrontendInterface> base_extractor)
    : opts_(opts), feature_window_funtion_(opts.frame_opts) {
    base_extractor_ = std::move(base_extractor);
LinearSpectrogramComputer::LinearSpectrogramComputer(const Options& opts)
    : opts_(opts) {
    kaldi::FeatureWindowFunction feature_window_function(opts.frame_opts);
    int32 window_size = opts.frame_opts.WindowSize();
    int32 window_shift = opts.frame_opts.WindowShift();
    frame_length_ = window_size;
    dim_ = window_size / 2 + 1;
    chunk_sample_size_ =
        static_cast<int32>(opts.streaming_chunk * opts.frame_opts.samp_freq);
    hanning_window_energy_ = kaldi::VecVec(feature_window_funtion_.window,
                                           feature_window_funtion_.window);
}

void LinearSpectrogram::Accept(const VectorBase<BaseFloat>& inputs) {
    base_extractor_->Accept(inputs);
}

bool LinearSpectrogram::Read(Vector<BaseFloat>* feats) {
    Vector<BaseFloat> input_feats(chunk_sample_size_);
    bool flag = base_extractor_->Read(&input_feats);
    if (flag == false || input_feats.Dim() == 0) return false;

    int32 feat_len = input_feats.Dim();
    int32 left_len = remained_wav_.Dim();
    Vector<BaseFloat> waves(feat_len + left_len);
    waves.Range(0, left_len).CopyFromVec(remained_wav_);
    waves.Range(left_len, feat_len).CopyFromVec(input_feats);
    Compute(waves, feats);
    int32 frame_shift = opts_.frame_opts.WindowShift();
    int32 num_frames = kaldi::NumFrames(waves.Dim(), opts_.frame_opts);
    int32 left_samples = waves.Dim() - frame_shift * num_frames;
    remained_wav_.Resize(left_samples);
    remained_wav_.CopyFromVec(
        waves.Range(frame_shift * num_frames, left_samples));
    return true;
    BaseFloat hanning_window_energy = kaldi::VecVec(
        feature_window_function.window, feature_window_function.window);
    int32 sample_rate = opts.frame_opts.samp_freq;
    scale_ = 2.0 / (hanning_window_energy * sample_rate);
}

// Compute spectrogram feat
bool LinearSpectrogram::Compute(const Vector<BaseFloat>& waves,
                                Vector<BaseFloat>* feats) {
    int32 num_samples = waves.Dim();
    int32 frame_length = opts_.frame_opts.WindowSize();
    int32 sample_rate = opts_.frame_opts.samp_freq;
    BaseFloat scale = 2.0 / (hanning_window_energy_ * sample_rate);
    if (num_samples < frame_length) {
        return true;
    }
    int32 num_frames = kaldi::NumFrames(num_samples, opts_.frame_opts);
    feats->Resize(num_frames * dim_);
    Vector<BaseFloat> window;
    for (int frame_idx = 0; frame_idx < num_frames; ++frame_idx) {
        kaldi::ExtractWindow(0,
                             waves,
                             frame_idx,
                             opts_.frame_opts,
                             feature_window_funtion_,
                             &window,
                             NULL);
        SubVector<BaseFloat> output_row(feats->Data() + frame_idx * dim_, dim_);
        window.Resize(frame_length, kaldi::kCopyData);
        RealFft(&window, true);
        kaldi::ComputePowerSpectrum(&window);
        SubVector<BaseFloat> power_spectrum(window, 0, dim_);
        power_spectrum.Scale(scale);
bool LinearSpectrogramComputer::Compute(Vector<BaseFloat>* window,
                                        Vector<BaseFloat>* feat) {
    window->Resize(frame_length_, kaldi::kCopyData);
    RealFft(window, true);
    kaldi::ComputePowerSpectrum(window);
    SubVector<BaseFloat> power_spectrum(*window, 0, dim_);
    power_spectrum.Scale(scale_);
    power_spectrum(0) = power_spectrum(0) / 2;
    power_spectrum(dim_ - 1) = power_spectrum(dim_ - 1) / 2;
    power_spectrum.Add(1e-14);
    power_spectrum.ApplyLog();
        output_row.CopyFromVec(power_spectrum);
    }
    feat->CopyFromVec(power_spectrum);
    return true;
}
...
...
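Reading the new constructor and Compute body together, the per-bin scaling that used to be recomputed on every Compute call is now derived once at construction time. With the analysis window w (Hanning here) and sample rate f_s, the cached factor is

    scale_ = 2 / (f_s * Σ_n w[n]^2)

Each power-spectrum bin is multiplied by this factor, the DC and Nyquist bins are then halved, and 1e-14 is added before taking the log.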
speechx/speechx/frontend/audio/linear_spectrogram.h
...
...
@@ -16,6 +16,7 @@
#pragma once

#include "base/common.h"
#include "frontend/audio/feature_common.h"
#include "frontend/audio/frontend_itf.h"
#include "kaldi/feat/feature-window.h"
...
...
@@ -23,47 +24,34 @@ namespace ppspeech {

struct LinearSpectrogramOptions {
    kaldi::FrameExtractionOptions frame_opts;
    kaldi::BaseFloat streaming_chunk;  // second
    LinearSpectrogramOptions() : streaming_chunk(0.1), frame_opts() {}

    void Register(kaldi::OptionsItf* opts) {
        opts->Register("streaming-chunk",
                       &streaming_chunk,
                       "streaming chunk size, default: 0.1 sec");
        frame_opts.Register(opts);
    }
    LinearSpectrogramOptions() : frame_opts() {}
};

class LinearSpectrogram : public FrontendInterface {
class LinearSpectrogramComputer {
  public:
    explicit LinearSpectrogram(
        const LinearSpectrogramOptions& opts,
        std::unique_ptr<FrontendInterface> base_extractor);
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
    // the dim_ is the dim of single frame feature
    virtual size_t Dim() const { return dim_; }

    virtual void SetFinished() { base_extractor_->SetFinished(); }

    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }

    virtual void Reset() {
        base_extractor_->Reset();
        remained_wav_.Resize(0);
    typedef LinearSpectrogramOptions Options;
    explicit LinearSpectrogramComputer(const Options& opts);

    kaldi::FrameExtractionOptions& GetFrameOptions() {
        return opts_.frame_opts;
    }

  private:
    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
                 kaldi::Vector<kaldi::BaseFloat>* feats);
    bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
                 kaldi::Vector<kaldi::BaseFloat>* feat);

    size_t dim_;
    kaldi::FeatureWindowFunction feature_window_funtion_;
    kaldi::BaseFloat hanning_window_energy_;
    LinearSpectrogramOptions opts_;
    std::unique_ptr<FrontendInterface> base_extractor_;
    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
    int chunk_sample_size_;
    DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram);
    int32 Dim() const { return dim_; }

    bool NeedRawLogEnergy() { return false; }

  private:
    kaldi::BaseFloat scale_;
    Options opts_;
    int32 frame_length_;
    int32 dim_;
    DISALLOW_COPY_AND_ASSIGN(LinearSpectrogramComputer);
};

typedef StreamingFeatureTpl<LinearSpectrogramComputer> LinearSpectrogram;

}  // namespace ppspeech
\ No newline at end of file
speechx/speechx/utils/CMakeLists.txt
add_library(utils
file_utils.cc
simdjson.cpp
)
\ No newline at end of file
speechx/speechx/utils/simdjson.cpp
deleted (100644 → 0)
speechx/speechx/utils/simdjson.h
deleted (100644 → 0)